Initial commit

2026-03-14 21:31:00 +08:00
parent 58dfd5548d
commit b77c37cfa5
9 changed files with 478 additions and 0 deletions
--- a/.dvc/config
+++ b/.dvc/config
@@ -0,0 +1,4 @@
 [core]
    remote = myremote
 ['remote "myremote"']
    url = /mnt/safe/dvc-remote
--- a/.github/workflows/cml.yaml
+++ b/.github/workflows/cml.yaml
@@ -0,0 +1,63 @@
 name: CML Report
 on: pull_request
 jobs:
  run:
    runs-on: [ubuntu-latest]
    steps:
      - uses: iterative/setup-cml@v2
      - uses: iterative/setup-dvc@v1
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      # Needed for https://github.com/iterative/example-repos-dev/issues/225
      - name: Installs JSON5
        run: npm install -g json5
      - name: Generate metrics report
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml ci
          if [ $GITHUB_REF = refs/heads/main ]; then
            PREVIOUS_REF=HEAD~1
          else
            PREVIOUS_REF=main
            git fetch origin main:main
          fi
          dvc pull eval
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets ROC | json5 > vega.json
          vl2svg vega.json roc.svg
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Precision-Recall | json5 > vega.json
          vl2svg vega.json prc.svg
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Confusion-Matrix | json5 > vega.json
          vl2svg vega.json confusion.svg
          cp eval/plots/images/importance.png importance_workspace.png
          git checkout $PREVIOUS_REF -- dvc.lock
          cp eval/plots/images/importance.png importance_previous.png
          dvc_report=$(dvc exp diff $PREVIOUS_REF --md)
          cat <<EOF > report.md
          # CML Report
          ## Plots
          ![ROC](./roc.svg)
          ![Precision-Recall](./prc.svg)
          ![Confusion Matrix](./confusion.svg)
          #### Feature Importance: ${PREVIOUS_REF}
          ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
          #### Feature Importance: workspace
          ![Feature Importance: workspace](./importance_workspace.png)
          ## Metrics and Params
          ### ${PREVIOUS_REF} → workspace
          ${dvc_report}
          EOF
          cml comment create --publish --pr=false report.md
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 .venv
--- a/params.yaml
+++ b/params.yaml
@@ -0,0 +1,13 @@
 prepare:
  split: 0.20
  seed: 20170428
 featurize:
  max_features: 100
  ngrams: 1
 train:
  seed: 20170428
  n_est: 50
  min_split: 0.01
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -0,0 +1,112 @@
 import json
 import math
 import os
 import pickle
 import sys
 import pandas as pd
 from sklearn import metrics
 from sklearn import tree
 from dvclive import Live
 from matplotlib import pyplot as plt
 def evaluate(model, matrix, split, live, save_path):
    """
    Log evaluation metrics and plots for one dataset split through DVCLive.
    Average precision and ROC AUC are stored in ``live.summary`` under the
    split name; ROC, precision-recall and confusion-matrix plots are logged
    as ``roc/<split>``, ``prc/<split>`` and ``cm/<split>``.
    Args:
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        matrix (scipy.sparse.csr_matrix): Input matrix; column 1 is read as
            the labels and columns 2 onwards as the features.
        split (str): Dataset name.
        live (dvclive.Live): Dvclive instance.
        save_path (str): Path to save the metrics (not read by this function).
    """
    features = matrix[:, 2:]
    labels = matrix[:, 1].toarray().astype(int)
    class_probabilities = model.predict_proba(features)
    # Score of class index 1, used for the threshold-based metrics and plots.
    positive_scores = class_probabilities[:, 1]
    # Scalar metrics, keyed the same way as the summary dictionary.
    scalar_metrics = {
        "avg_prec": metrics.average_precision_score(labels, positive_scores),
        "roc_auc": metrics.roc_auc_score(labels, positive_scores),
    }
    # The summary is created on first use and then shared across splits.
    if not live.summary:
        live.summary = {"avg_prec": {}, "roc_auc": {}}
    for metric_name, value in scalar_metrics.items():
        live.summary[metric_name][split] = value
    # ROC curve.
    live.log_sklearn_plot("roc", labels, positive_scores, name=f"roc/{split}")
    # Precision-recall curve; drop_intermediate=True is handed on as an
    # extra keyword argument.
    prc_options = {"name": f"prc/{split}", "drop_intermediate": True}
    live.log_sklearn_plot("precision_recall", labels, positive_scores, **prc_options)
    # Confusion matrix on the most probable class of each row.
    predicted_classes = class_probabilities.argmax(-1)
    live.log_sklearn_plot(
        "confusion_matrix", labels.squeeze(), predicted_classes, name=f"cm/{split}"
    )
 def save_importance_plot(live, model, feature_names):
    """
    Log a bar chart of the model's largest feature importances.
    The 30 largest values of ``model.feature_importances_`` are plotted and
    the figure is logged through DVCLive as ``importance.png``.
    Args:
        live (dvclive.Live): DVCLive instance.
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        feature_names (list): List of feature names.
    """
    figure, axis = plt.subplots(dpi=100)
    figure.subplots_adjust(bottom=0.2, top=0.95)
    axis.set_ylabel("Mean decrease in impurity")
    # Label each importance with its feature name, then keep the top 30.
    labelled_importances = pd.Series(model.feature_importances_, index=feature_names)
    top_importances = labelled_importances.nlargest(n=30)
    top_importances.plot.bar(ax=axis)
    live.log_image("importance.png", figure)
 def main():
    """
    Evaluate a pickled model on the pickled train and test feature sets.
    Command line: ``python evaluate.py model features`` where ``features``
    is the directory holding ``train.pkl`` and ``test.pkl``. Results are
    logged through DVCLive into the ``eval`` directory. Exits with status 1
    when the argument count is wrong.
    """
    EVAL_PATH = "eval"
    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython evaluate.py model features\n")
        sys.exit(1)
    _, model_file, features_dir = sys.argv
    def load_pickle(path):
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files produced by trusted pipeline stages.
        with open(path, "rb") as fd:
            return pickle.load(fd)
    # Load model and data.
    model = load_pickle(model_file)
    train, feature_names = load_pickle(os.path.join(features_dir, "train.pkl"))
    test, _ = load_pickle(os.path.join(features_dir, "test.pkl"))
    with Live(EVAL_PATH) as live:
        # Evaluate train first, then test, with the same Live instance.
        for split_name, split_matrix in (("train", train), ("test", test)):
            evaluate(model, split_matrix, split_name, live, save_path=EVAL_PATH)
        # Dump feature importance plot.
        save_importance_plot(live, model, feature_names)
 if __name__ == "__main__":
    main()
--- a/src/featurization.py
+++ b/src/featurization.py
@@ -0,0 +1,136 @@
 import os
 import pickle
 import sys
 import numpy as np
 import pandas as pd
 import scipy.sparse as sparse
 import yaml
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 def get_df(data):
    """Load a headerless, tab-separated UTF-8 file into a data frame with columns id, label and text."""
    column_names = ["id", "label", "text"]
    frame = pd.read_csv(
        data, sep="\t", header=None, names=column_names, encoding="utf-8"
    )
    # Report the loaded shape on stderr so stdout stays clean.
    sys.stderr.write(f"The input data frame {data} size is {frame.shape}\n")
    return frame
 def save_matrix(df, matrix, names, output):
    """
    Pickle the feature matrix together with its id and label columns.
    The ``id`` and ``label`` columns of ``df`` are placed in front of
    ``matrix`` (in that order) and the combined CSR matrix is pickled as a
    ``(matrix, names)`` tuple.
    Args:
        df (pandas.DataFrame): Input data frame.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        names (list): List of feature names.
        output (str): Output file name.
    """
    def as_sparse_column(series):
        # int64 sparse matrix built from the series, transposed before stacking.
        return sparse.csr_matrix(series.astype(np.int64)).T
    blocks = [as_sparse_column(df.id), as_sparse_column(df.label), matrix]
    result = sparse.hstack(blocks, format="csr")
    sys.stderr.write(
        f"The output matrix {output} size is {result.shape} "
        f"and data type is {result.dtype}\n"
    )
    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)
 def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Fit the vectorizers on the train data and save the train features.
    ``bag_of_words`` and ``tfidf`` are both fitted in place on the
    lower-cased train texts, so they can be reused afterwards to transform
    other data with the same vocabulary.
    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    train_frame = get_df(train_input)
    lowered_texts = np.array(train_frame.text.str.lower().values)
    # fit() returns the estimator itself, so fit and transform can be chained.
    term_matrix = bag_of_words.fit(lowered_texts).transform(lowered_texts)
    vocabulary = bag_of_words.get_feature_names_out()
    tfidf_matrix = tfidf.fit(term_matrix).transform(term_matrix)
    save_matrix(train_frame, tfidf_matrix, vocabulary, train_output)
 def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Transform the test data with already-fitted vectorizers and save it.
    Neither ``bag_of_words`` nor ``tfidf`` is fitted here; they are only
    used to transform the lower-cased test texts.
    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    test_frame = get_df(test_input)
    lowered_texts = np.array(test_frame.text.str.lower().values)
    tfidf_matrix = tfidf.transform(bag_of_words.transform(lowered_texts))
    vocabulary = bag_of_words.get_feature_names_out()
    save_matrix(test_frame, tfidf_matrix, vocabulary, test_output)
 def main():
    """
    Build TF-IDF feature files for the prepared train and test sets.
    Reads ``max_features`` and ``ngrams`` from the ``featurize`` section of
    ``params.yaml``, fits a bag-of-words vectorizer and a TF-IDF transformer
    on ``train.tsv`` and writes ``train.pkl`` and ``test.pkl`` into the
    output directory (created if missing).
    Command line: ``python featurization.py data-dir-path features-dir-path``.
    Exits with status 1 when the argument count is wrong.
    """
    # Check the command line before anything else so that a usage error is
    # reported even when params.yaml is missing or unreadable.
    # NOTE(review): an argument count of 5 is accepted as well, although only
    # the first two arguments are read -- confirm whether callers rely on it.
    if len(sys.argv) not in (3, 5):
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)
    # Use a context manager so the parameters file is closed deterministically.
    with open("params.yaml") as fd:
        params = yaml.safe_load(fd)["featurize"]
    np.set_printoptions(suppress=True)
    in_path = sys.argv[1]
    out_path = sys.argv[2]
    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")
    max_features = params["max_features"]
    ngrams = params["ngrams"]
    os.makedirs(out_path, exist_ok=True)
    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)
    # The train step fits both vectorizers; the test step only reuses them,
    # so the order of these two calls matters.
    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )
    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )
 if __name__ == "__main__":
    main()
--- a/src/prepare.py
+++ b/src/prepare.py
@@ -0,0 +1,78 @@
 import os
 import random
 import re
 import sys
 import xml.etree.ElementTree
 import yaml
 def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split):
    """
    Parse XML post lines and write them as TSV rows to the train or test file.
    Each line is parsed as a single XML element. Its ``Id``, ``Tags``,
    ``Title`` and ``Body`` attributes become one ``id<TAB>label<TAB>text``
    row, where the label is 1 when ``target_tag`` occurs in ``Tags`` and
    the text is the whitespace-normalised title and body joined by a space.
    Lines that cannot be processed are reported on stderr with their
    1-based position in ``input_lines`` and skipped.
    Args:
        input_lines (list): List of input lines.
        fd_out_train (file): Output file for the training data set.
        fd_out_test (file): Output file for the test data set.
        target_tag (str): Target tag.
        split (float): Test data set split ratio.
    """
    # enumerate() keeps the reported number tied to the input position; a
    # counter bumped only on success drifts after the first broken line.
    for line_number, line in enumerate(input_lines, start=1):
        try:
            # Draw before parsing so every input line consumes exactly one
            # random number, broken or not (keeps the split reproducible).
            fd_out = fd_out_train if random.random() > split else fd_out_test
            attr = xml.etree.ElementTree.fromstring(line).attrib
            pid = attr.get("Id", "")
            label = 1 if target_tag in attr.get("Tags", "") else 0
            # Collapse whitespace runs so the text holds no tabs or newlines
            # that would break the TSV row.
            title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
            body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
            text = title + " " + body
            fd_out.write("{}\t{}\t{}\n".format(pid, label, text))
        except Exception as ex:
            # Deliberately best-effort: report the line and carry on.
            sys.stderr.write(f"Skipping the broken line {line_number}: {ex}\n")
 def main():
    """
    Split the raw posts file into prepared train and test TSV files.
    Reads ``split`` and ``seed`` from the ``prepare`` section of
    ``params.yaml``, seeds the random generator and writes
    ``data/prepared/train.tsv`` and ``data/prepared/test.tsv``.
    Command line: ``python prepare.py data-file``.
    Exits with status 1 when the argument count is wrong.
    """
    # Check the command line before anything else so that a usage error is
    # reported even when params.yaml is missing or unreadable.
    if len(sys.argv) != 2:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython prepare.py data-file\n")
        sys.exit(1)
    # Use a context manager so the parameters file is closed deterministically.
    with open("params.yaml") as fd:
        params = yaml.safe_load(fd)["prepare"]
    # Test data set split ratio
    split = params["split"]
    random.seed(params["seed"])
    input_path = sys.argv[1]
    prepared_dir = os.path.join("data", "prepared")
    output_train = os.path.join(prepared_dir, "train.tsv")
    output_test = os.path.join(prepared_dir, "test.tsv")
    os.makedirs(prepared_dir, exist_ok=True)
    # Read the input as UTF-8 explicitly, matching the encoding used for the
    # outputs, rather than relying on the platform's locale encoding.
    with open(input_path, encoding="utf-8") as fd_in:
        input_lines = fd_in.readlines()
    # Context managers close both outputs even if processing raises.
    with open(output_train, "w", encoding="utf-8") as fd_out_train:
        with open(output_test, "w", encoding="utf-8") as fd_out_test:
            process_posts(
                input_lines=input_lines,
                fd_out_train=fd_out_train,
                fd_out_test=fd_out_test,
                target_tag="<r>",
                split=split,
            )
 if __name__ == "__main__":
    main()
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -0,0 +1,6 @@
 dvclive>=3.0
 pandas
 pyaml
 scikit-learn>=1.3
 scipy
 matplotlib
--- a/src/train.py
+++ b/src/train.py
@@ -0,0 +1,65 @@
 import os
 import pickle
 import sys
 import numpy as np
 import yaml
 from sklearn.ensemble import RandomForestClassifier
 def train(seed, n_est, min_split, matrix):
    """
    Fit a random forest classifier on a labelled feature matrix.
    Args:
        seed (int): Random seed.
        n_est (int): Number of trees in the forest.
        min_split (int or float): Value forwarded to the classifier's
            ``min_samples_split`` setting.
        matrix (scipy.sparse.csr_matrix): Input matrix; column 1 is read as
            the labels and columns 2 onwards as the features.
    Returns:
        sklearn.ensemble.RandomForestClassifier: Trained classifier.
    """
    features = matrix[:, 2:]
    # Densify the label column and squeeze out its singleton dimension.
    targets = np.squeeze(matrix[:, 1].toarray())
    sys.stderr.write(f"Input matrix size {matrix.shape}\n")
    sys.stderr.write(f"X matrix size {features.shape}\n")
    sys.stderr.write(f"Y matrix size {targets.shape}\n")
    forest = RandomForestClassifier(
        n_estimators=n_est,
        min_samples_split=min_split,
        n_jobs=2,
        random_state=seed,
    )
    forest.fit(features, targets)
    return forest
 def main():
    """
    Train the classifier on the pickled train features and pickle the model.
    Reads ``seed``, ``n_est`` and ``min_split`` from the ``train`` section of
    ``params.yaml``, loads ``train.pkl`` from the features directory and
    writes the fitted model to the given output file.
    Command line: ``python train.py features model``.
    Exits with status 1 when the argument count is wrong.
    """
    # Check the command line before anything else so that a usage error is
    # reported even when params.yaml is missing or unreadable.
    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython train.py features model\n")
        sys.exit(1)
    # Use a context manager so the parameters file is closed deterministically.
    with open("params.yaml") as fd:
        params = yaml.safe_load(fd)["train"]
    # Local names avoid shadowing the built-in input().
    features_dir = sys.argv[1]
    model_path = sys.argv[2]
    seed = params["seed"]
    n_est = params["n_est"]
    min_split = params["min_split"]
    # Load the data
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by trusted pipeline stages.
    with open(os.path.join(features_dir, "train.pkl"), "rb") as fd:
        matrix, _ = pickle.load(fd)
    clf = train(seed=seed, n_est=n_est, min_split=min_split, matrix=matrix)
    # Save the model
    with open(model_path, "wb") as fd:
        pickle.dump(clf, fd)
 if __name__ == "__main__":
    main()