Initial commit

This commit is contained in:
2026-03-14 21:31:00 +08:00
parent 58dfd5548d
commit b77c37cfa5
9 changed files with 478 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
[core]
remote = myremote
['remote "myremote"']
url = /mnt/safe/dvc-remote

63
.github/workflows/cml.yaml vendored Normal file
View File

@@ -0,0 +1,63 @@
# CML report workflow: on every pull request, regenerate evaluation plots and
# metrics diffs against the previous ref and post them as a PR comment.
# (Indentation restored — the YAML structure had been flattened.)
name: CML Report
on: pull_request
jobs:
  run:
    runs-on: [ubuntu-latest]
    steps:
      - uses: iterative/setup-cml@v2
      - uses: iterative/setup-dvc@v1
      - uses: actions/checkout@v3
        with:
          # Need the parent commit so `dvc plots diff HEAD~1` works on main.
          fetch-depth: 2
      # Needed for https://github.com/iterative/example-repos-dev/issues/225
      - name: Installs JSON5
        run: npm install -g json5
      - name: Generate metrics report
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml ci
          # Compare against the parent commit on main, otherwise against main.
          if [ $GITHUB_REF = refs/heads/main ]; then
            PREVIOUS_REF=HEAD~1
          else
            PREVIOUS_REF=main
            git fetch origin main:main
          fi
          dvc pull eval
          # Each plot diff is emitted as Vega JSON (via json5) then rendered to SVG.
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets ROC | json5 > vega.json
          vl2svg vega.json roc.svg
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Precision-Recall | json5 > vega.json
          vl2svg vega.json prc.svg
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Confusion-Matrix | json5 > vega.json
          vl2svg vega.json confusion.svg
          # Snapshot current importance image, then the previous ref's version.
          cp eval/plots/images/importance.png importance_workspace.png
          git checkout $PREVIOUS_REF -- dvc.lock
          cp eval/plots/images/importance.png importance_previous.png
          dvc_report=$(dvc exp diff $PREVIOUS_REF --md)
          cat <<EOF > report.md
          # CML Report
          ## Plots
          ![ROC](./roc.svg)
          ![Precision-Recall](./prc.svg)
          ![Confusion Matrix](./confusion.svg)
          #### Feature Importance: ${PREVIOUS_REF}
          ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
          #### Feature Importance: workspace
          ![Feature Importance: workspace](./importance_workspace.png)
          ## Metrics and Params
          ### ${PREVIOUS_REF} → workspace
          ${dvc_report}
          EOF
          cml comment create --publish --pr=false report.md

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
.venv

13
params.yaml Normal file
View File

@@ -0,0 +1,13 @@
# Pipeline stage parameters, read via yaml.safe_load in src/*.py.
# (Nesting restored — the YAML structure had been flattened.)
prepare:
  split: 0.20        # test-set split ratio used by process_posts
  seed: 20170428     # random.seed for the train/test routing
featurize:
  max_features: 100  # CountVectorizer max_features (vocabulary cap)
  ngrams: 1          # upper bound of CountVectorizer ngram_range
train:
  seed: 20170428     # random_state for RandomForestClassifier
  n_est: 50          # n_estimators (number of trees)
  min_split: 0.01    # min_samples_split (fraction of samples)

112
src/evaluate.py Normal file
View File

@@ -0,0 +1,112 @@
import json
import math
import os
import pickle
import sys
import pandas as pd
from sklearn import metrics
from sklearn import tree
from dvclive import Live
from matplotlib import pyplot as plt
def evaluate(model, matrix, split, live, save_path):
    """
    Log all evaluation metrics and plots for one dataset split.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        matrix (scipy.sparse.csr_matrix): Input matrix; column 1 holds the
            labels and columns 2+ hold the features.
        split (str): Dataset name (e.g. "train" or "test").
        live (dvclive.Live): Dvclive instance used for logging.
        save_path (str): Path to save the metrics. (Unused in the body;
            kept for interface compatibility.)
    """
    labels = matrix[:, 1].toarray().astype(int)
    features = matrix[:, 2:]

    proba = model.predict_proba(features)
    positive_scores = proba[:, 1]

    # Scalar summary metrics, keyed per split under a shared summary dict.
    avg_prec = metrics.average_precision_score(labels, positive_scores)
    roc_auc = metrics.roc_auc_score(labels, positive_scores)
    if not live.summary:
        live.summary = {"avg_prec": {}, "roc_auc": {}}
    live.summary["avg_prec"][split] = avg_prec
    live.summary["roc_auc"][split] = roc_auc

    # ROC curve plot.
    live.log_sklearn_plot("roc", labels, positive_scores, name=f"roc/{split}")

    # Precision-recall plot; drop_intermediate is forwarded to sklearn.
    live.log_sklearn_plot(
        "precision_recall",
        labels,
        positive_scores,
        name=f"prc/{split}",
        drop_intermediate=True,
    )

    # Confusion-matrix plot built from hard class predictions.
    live.log_sklearn_plot(
        "confusion_matrix",
        labels.squeeze(),
        proba.argmax(-1),
        name=f"cm/{split}",
    )
def save_importance_plot(live, model, feature_names):
    """
    Log a bar plot of the 30 largest feature importances.

    Args:
        live (dvclive.Live): DVCLive instance.
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        feature_names (list): Feature names aligned with
            model.feature_importances_.
    """
    fig, axes = plt.subplots(dpi=100)
    fig.subplots_adjust(bottom=0.2, top=0.95)
    axes.set_ylabel("Mean decrease in impurity")

    # Keep only the top-30 features so the bar chart stays readable.
    top_importances = pd.Series(
        model.feature_importances_, index=feature_names
    ).nlargest(n=30)
    top_importances.plot.bar(ax=axes)

    live.log_image("importance.png", fig)
def main():
    """CLI entry point: evaluate a trained model on train/test features."""
    EVAL_PATH = "eval"

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython evaluate.py model features\n")
        sys.exit(1)

    model_file = sys.argv[1]
    features_dir = sys.argv[2]

    # Load the pickled model and the featurized train/test matrices.
    with open(model_file, "rb") as fd:
        model = pickle.load(fd)
    with open(os.path.join(features_dir, "train.pkl"), "rb") as fd:
        train, feature_names = pickle.load(fd)
    with open(os.path.join(features_dir, "test.pkl"), "rb") as fd:
        test, _ = pickle.load(fd)

    # Evaluate both splits inside a single dvclive run, then log the
    # feature-importance image.
    with Live(EVAL_PATH) as live:
        evaluate(model, train, "train", live, save_path=EVAL_PATH)
        evaluate(model, test, "test", live, save_path=EVAL_PATH)
        save_importance_plot(live, model, feature_names)


if __name__ == "__main__":
    main()

136
src/featurization.py Normal file
View File

@@ -0,0 +1,136 @@
import os
import pickle
import sys
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import yaml
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
def get_df(data):
    """Read one tab-separated input file into an (id, label, text) DataFrame."""
    frame = pd.read_csv(
        data,
        delimiter="\t",
        header=None,
        names=["id", "label", "text"],
        encoding="utf-8",
    )
    # Report the loaded size on stderr so stdout stays clean for pipelines.
    sys.stderr.write(f"The input data frame {data} size is {frame.shape}\n")
    return frame
def save_matrix(df, matrix, names, output):
    """
    Serialize (matrix, feature names) to a pickle file.

    The ``id`` and ``label`` columns of ``df`` are prepended to ``matrix``,
    so the saved matrix columns are: id, label, then the features.

    Args:
        df (pandas.DataFrame): Input data frame with ``id`` and ``label`` columns.
        matrix (scipy.sparse.csr_matrix): Feature matrix.
        names (list): List of feature names.
        output (str): Output file name.
    """
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = "The output matrix {} size is {} and data type is {}\n"
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)
    # Fix: removed a stray trailing `pass` statement (dead code).
def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Fit the vectorizers on the training text and save the train feature matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    corpus = np.array(df_train.text.str.lower().values)

    # Fit on the training corpus; the fitted vectorizers are reused later
    # for the test split so both share one vocabulary and IDF weights.
    bag_of_words.fit(corpus)
    counts = bag_of_words.transform(corpus)
    feature_names = bag_of_words.get_feature_names_out()

    tfidf.fit(counts)
    weighted = tfidf.transform(counts)

    save_matrix(df_train, weighted, feature_names, train_output)
def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Transform the test text with already-fitted vectorizers and save the matrix.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_test = get_df(test_input)
    corpus = np.array(df_test.text.str.lower().values)

    # Transform only — fitting happened on the training split.
    weighted = tfidf.transform(bag_of_words.transform(corpus))

    save_matrix(df_test, weighted, bag_of_words.get_feature_names_out(), test_output)
def main():
    """CLI entry point: featurize prepared TSV splits into pickled matrices."""
    # Fix: use a context manager so the params file handle is closed
    # promptly instead of being leaked by a bare open().
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["featurize"]

    np.set_printoptions(suppress=True)

    # NOTE(review): 5 arguments are also accepted although the usage string
    # documents only 2; the extra arguments are ignored. Kept as-is for
    # backward compatibility — confirm whether the 5-arg form is still used.
    if len(sys.argv) != 3 and len(sys.argv) != 5:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

    max_features = params["max_features"]
    ngrams = params["ngrams"]

    os.makedirs(out_path, exist_ok=True)

    # One shared vectorizer pair: fitted on train, reused for test.
    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )
    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()

78
src/prepare.py Normal file
View File

@@ -0,0 +1,78 @@
import os
import random
import re
import sys
import xml.etree.ElementTree
import yaml
def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split):
    """
    Split raw XML post lines into train/test TSV rows.

    Each line is parsed as a single XML element; its Id, Tags, Title and Body
    attributes become one tab-separated row (id, label, text). ``label`` is 1
    when ``target_tag`` occurs in the Tags attribute, else 0. Broken lines
    are skipped with a message on stderr.

    Args:
        input_lines (list): List of raw XML input lines.
        fd_out_train (file): Output file for the training data set.
        fd_out_test (file): Output file for the test data set.
        target_tag (str): Tag whose presence marks a positive example.
        split (float): Test data set split ratio.
    """
    # Fix: the counter previously advanced only on success, so the line
    # number reported for broken lines drifted after the first failure.
    # enumerate() always reflects the true 1-based line index.
    for num, line in enumerate(input_lines, start=1):
        try:
            # Route each post randomly according to the split ratio.
            fd_out = fd_out_train if random.random() > split else fd_out_test

            attr = xml.etree.ElementTree.fromstring(line).attrib
            pid = attr.get("Id", "")
            label = 1 if target_tag in attr.get("Tags", "") else 0
            # Collapse whitespace runs so the text fits on one TSV line.
            title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
            body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
            text = title + " " + body

            fd_out.write("{}\t{}\t{}\n".format(pid, label, text))
        except Exception as ex:
            sys.stderr.write(f"Skipping the broken line {num}: {ex}\n")
def main():
    """CLI entry point: split the raw posts file into train/test TSVs."""
    # Fix: close params.yaml promptly instead of leaking the handle.
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["prepare"]

    if len(sys.argv) != 2:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython prepare.py data-file\n")
        sys.exit(1)

    # Test data set split ratio, seeded for a deterministic split.
    split = params["split"]
    random.seed(params["seed"])

    input_file = sys.argv[1]  # renamed from `input` (shadowed the builtin)
    output_train = os.path.join("data", "prepared", "train.tsv")
    output_test = os.path.join("data", "prepared", "test.tsv")

    os.makedirs(os.path.join("data", "prepared"), exist_ok=True)

    with open(input_file) as fd_in:
        input_lines = fd_in.readlines()

    # Fix: context managers guarantee the output files are closed even if
    # process_posts raises; the originals were left open on error.
    with open(output_train, "w", encoding="utf-8") as fd_out_train, open(
        output_test, "w", encoding="utf-8"
    ) as fd_out_test:
        process_posts(
            input_lines=input_lines,
            fd_out_train=fd_out_train,
            fd_out_test=fd_out_test,
            target_tag="<r>",
            split=split,
        )


if __name__ == "__main__":
    main()

6
src/requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
dvclive>=3.0
pandas
pyaml
scikit-learn>=1.3
scipy
matplotlib

65
src/train.py Normal file
View File

@@ -0,0 +1,65 @@
import os
import pickle
import sys
import numpy as np
import yaml
from sklearn.ensemble import RandomForestClassifier
def train(seed, n_est, min_split, matrix):
    """
    Fit a random forest classifier on a featurized matrix.

    Args:
        seed (int): Random seed.
        n_est (int): Number of trees in the forest.
        min_split (int or float): Minimum number (or fraction) of samples
            required to split an internal node.
        matrix (scipy.sparse.csr_matrix): Input matrix; column 1 holds the
            labels, columns 2+ hold the features.

    Returns:
        sklearn.ensemble.RandomForestClassifier: Trained classifier.
    """
    labels = np.squeeze(matrix[:, 1].toarray())
    features = matrix[:, 2:]

    # Report the shapes on stderr for pipeline debugging.
    for name, shape in (
        ("Input", matrix.shape),
        ("X", features.shape),
        ("Y", labels.shape),
    ):
        sys.stderr.write("{} matrix size {}\n".format(name, shape))

    clf = RandomForestClassifier(
        n_estimators=n_est, min_samples_split=min_split, n_jobs=2, random_state=seed
    )
    return clf.fit(features, labels)
def main():
    """CLI entry point: train the model from pickled features and save it."""
    # Fix: close params.yaml promptly instead of leaking the handle.
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["train"]

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython train.py features model\n")
        sys.exit(1)

    features_dir = sys.argv[1]  # renamed from `input` (shadowed the builtin)
    model_path = sys.argv[2]

    seed = params["seed"]
    n_est = params["n_est"]
    min_split = params["min_split"]

    # Load the featurized training matrix (feature names are unused here).
    with open(os.path.join(features_dir, "train.pkl"), "rb") as fd:
        matrix, _ = pickle.load(fd)

    clf = train(seed=seed, n_est=n_est, min_split=min_split, matrix=matrix)

    # Persist the fitted model.
    with open(model_path, "wb") as fd:
        pickle.dump(clf, fd)


if __name__ == "__main__":
    main()