diff --git a/.dvc/config b/.dvc/config
index e69de29..c7e09ba 100644
--- a/.dvc/config
+++ b/.dvc/config
@@ -0,0 +1,4 @@
+[core]
+    remote = myremote
+['remote "myremote"']
+    url = /mnt/safe/dvc-remote
diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml
new file mode 100644
index 0000000..3536e22
--- /dev/null
+++ b/.github/workflows/cml.yaml
@@ -0,0 +1,63 @@
+name: CML Report
+on: pull_request
+jobs:
+  run:
+    runs-on: [ubuntu-latest]
+    steps:
+      - uses: iterative/setup-cml@v2
+      - uses: iterative/setup-dvc@v1
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      # Needed for https://github.com/iterative/example-repos-dev/issues/225
+      - name: Installs JSON5
+        run: npm install -g json5
+      - name: Generate metrics report
+        env:
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          cml ci
+          if [ $GITHUB_REF = refs/heads/main ]; then
+            PREVIOUS_REF=HEAD~1
+          else
+            PREVIOUS_REF=main
+            git fetch origin main:main
+          fi
+
+          dvc pull eval
+          dvc plots diff $PREVIOUS_REF workspace \
+            --show-vega --targets ROC | json5 > vega.json
+          vl2svg vega.json roc.svg
+
+          dvc plots diff $PREVIOUS_REF workspace \
+            --show-vega --targets Precision-Recall | json5 > vega.json
+          vl2svg vega.json prc.svg
+
+          dvc plots diff $PREVIOUS_REF workspace \
+            --show-vega --targets Confusion-Matrix | json5 > vega.json
+          vl2svg vega.json confusion.svg
+
+          cp eval/plots/images/importance.png importance_workspace.png
+
+          git checkout $PREVIOUS_REF -- dvc.lock
+          cp eval/plots/images/importance.png importance_previous.png
+
+          dvc_report=$(dvc exp diff $PREVIOUS_REF --md)
+
+          cat <<EOF > report.md
+          # CML Report
+          ## Plots
+          ![ROC](./roc.svg)
+          ![Precision-Recall](./prc.svg)
+          ![Confusion Matrix](./confusion.svg)
+          #### Feature Importance: ${PREVIOUS_REF}
+          ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
+          #### Feature Importance: workspace
+          ![Feature Importance: workspace](./importance_workspace.png)
+
+          ## Metrics and Params
+          ### ${PREVIOUS_REF} → workspace
+          ${dvc_report}
+          EOF
+
+          cml comment create --publish --pr=false report.md
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1d17dae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.venv
diff --git a/params.yaml b/params.yaml
new file mode 100644
index 0000000..762d348
--- /dev/null
+++ b/params.yaml
@@ -0,0 +1,13 @@
+prepare:
+  split: 0.20
+  seed: 20170428
+
+featurize:
+  max_features: 100
+  ngrams: 1
+
+train:
+  seed: 20170428
+  n_est: 50
+  min_split: 0.01
+
diff --git a/src/evaluate.py b/src/evaluate.py
new file mode 100644
index 0000000..1c0d95f
--- /dev/null
+++ b/src/evaluate.py
@@ -0,0 +1,112 @@
+import json
+import math
+import os
+import pickle
+import sys
+
+import pandas as pd
+from sklearn import metrics
+from sklearn import tree
+from dvclive import Live
+from matplotlib import pyplot as plt
+
+
+def evaluate(model, matrix, split, live, save_path):
+    """
+    Dump all evaluation metrics and plots for given datasets.
+
+    Args:
+        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
+        matrix (scipy.sparse.csr_matrix): Input matrix.
+        split (str): Dataset name.
+        live (dvclive.Live): Dvclive instance.
+        save_path (str): Path to save the metrics.
+    """
+    labels = matrix[:, 1].toarray().astype(int)
+    x = matrix[:, 2:]
+
+    predictions_by_class = model.predict_proba(x)
+    predictions = predictions_by_class[:, 1]
+
+    # Use dvclive to log a few simple metrics...
+    avg_prec = metrics.average_precision_score(labels, predictions)
+    roc_auc = metrics.roc_auc_score(labels, predictions)
+    if not live.summary:
+        live.summary = {"avg_prec": {}, "roc_auc": {}}
+    live.summary["avg_prec"][split] = avg_prec
+    live.summary["roc_auc"][split] = roc_auc
+
+    # ... and plots...
+    # ... like an roc plot...
+    live.log_sklearn_plot("roc", labels, predictions, name=f"roc/{split}")
+    # ... and precision recall plot...
+    # ... which passes `drop_intermediate=True` to the sklearn method...
+    live.log_sklearn_plot(
+        "precision_recall",
+        labels,
+        predictions,
+        name=f"prc/{split}",
+        drop_intermediate=True,
+    )
+    # ... and confusion matrix plot
+    live.log_sklearn_plot(
+        "confusion_matrix",
+        labels.squeeze(),
+        predictions_by_class.argmax(-1),
+        name=f"cm/{split}",
+    )
+
+
+def save_importance_plot(live, model, feature_names):
+    """
+    Save feature importance plot.
+
+    Args:
+        live (dvclive.Live): DVCLive instance.
+        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
+        feature_names (list): List of feature names.
+    """
+    fig, axes = plt.subplots(dpi=100)
+    fig.subplots_adjust(bottom=0.2, top=0.95)
+    axes.set_ylabel("Mean decrease in impurity")
+
+    importances = model.feature_importances_
+    forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30)
+    forest_importances.plot.bar(ax=axes)
+
+    live.log_image("importance.png", fig)
+
+
+def main():
+    EVAL_PATH = "eval"
+
+    if len(sys.argv) != 3:
+        sys.stderr.write("Arguments error. Usage:\n")
+        sys.stderr.write("\tpython evaluate.py model features\n")
+        sys.exit(1)
+
+    model_file = sys.argv[1]
+    train_file = os.path.join(sys.argv[2], "train.pkl")
+    test_file = os.path.join(sys.argv[2], "test.pkl")
+
+    # Load model and data.
+    with open(model_file, "rb") as fd:
+        model = pickle.load(fd)
+
+    with open(train_file, "rb") as fd:
+        train, feature_names = pickle.load(fd)
+
+    with open(test_file, "rb") as fd:
+        test, _ = pickle.load(fd)
+
+    # Evaluate train and test datasets.
+    with Live(EVAL_PATH) as live:
+        evaluate(model, train, "train", live, save_path=EVAL_PATH)
+        evaluate(model, test, "test", live, save_path=EVAL_PATH)
+
+        # Dump feature importance plot.
+        save_importance_plot(live, model, feature_names)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/featurization.py b/src/featurization.py
new file mode 100644
index 0000000..9f49304
--- /dev/null
+++ b/src/featurization.py
@@ -0,0 +1,136 @@
+import os
+import pickle
+import sys
+
+import numpy as np
+import pandas as pd
+import scipy.sparse as sparse
+import yaml
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+
+
+def get_df(data):
+    """Read the input data file and return a data frame."""
+    df = pd.read_csv(
+        data,
+        encoding="utf-8",
+        header=None,
+        delimiter="\t",
+        names=["id", "label", "text"],
+    )
+    sys.stderr.write(f"The input data frame {data} size is {df.shape}\n")
+    return df
+
+
+def save_matrix(df, matrix, names, output):
+    """
+    Save the matrix to a pickle file.
+
+    Args:
+        df (pandas.DataFrame): Input data frame.
+        matrix (scipy.sparse.csr_matrix): Input matrix.
+        names (list): List of feature names.
+        output (str): Output file name.
+    """
+    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
+    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T
+
+    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")
+
+    msg = "The output matrix {} size is {} and data type is {}\n"
+    sys.stderr.write(msg.format(output, result.shape, result.dtype))
+
+    with open(output, "wb") as fd:
+        pickle.dump((result, names), fd)
+    pass
+
+
+def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
+    """
+    Generate train feature matrix.
+
+    Args:
+        train_input (str): Train input file name.
+        train_output (str): Train output file name.
+        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
+        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
+    """
+    df_train = get_df(train_input)
+    train_words = np.array(df_train.text.str.lower().values)
+
+    bag_of_words.fit(train_words)
+
+    train_words_binary_matrix = bag_of_words.transform(train_words)
+    feature_names = bag_of_words.get_feature_names_out()
+
+    tfidf.fit(train_words_binary_matrix)
+    train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)
+
+    save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)
+
+
+def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
+    """
+    Generate test feature matrix.
+
+    Args:
+        test_input (str): Test input file name.
+        test_output (str): Test output file name.
+        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
+        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
+    """
+    df_test = get_df(test_input)
+    test_words = np.array(df_test.text.str.lower().values)
+
+    test_words_binary_matrix = bag_of_words.transform(test_words)
+    test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
+    feature_names = bag_of_words.get_feature_names_out()
+
+    save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)
+
+
+def main():
+    params = yaml.safe_load(open("params.yaml"))["featurize"]
+
+    np.set_printoptions(suppress=True)
+
+    if len(sys.argv) != 3 and len(sys.argv) != 5:
+        sys.stderr.write("Arguments error. Usage:\n")
+        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
+        sys.exit(1)
+
+    in_path = sys.argv[1]
+    out_path = sys.argv[2]
+
+    train_input = os.path.join(in_path, "train.tsv")
+    test_input = os.path.join(in_path, "test.tsv")
+    train_output = os.path.join(out_path, "train.pkl")
+    test_output = os.path.join(out_path, "test.pkl")
+
+    max_features = params["max_features"]
+    ngrams = params["ngrams"]
+
+    os.makedirs(out_path, exist_ok=True)
+
+    bag_of_words = CountVectorizer(
+        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
+    )
+    tfidf = TfidfTransformer(smooth_idf=False)
+
+    generate_and_save_train_features(
+        train_input=train_input,
+        train_output=train_output,
+        bag_of_words=bag_of_words,
+        tfidf=tfidf,
+    )
+
+    generate_and_save_test_features(
+        test_input=test_input,
+        test_output=test_output,
+        bag_of_words=bag_of_words,
+        tfidf=tfidf,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/prepare.py b/src/prepare.py
new file mode 100644
index 0000000..e6b3a2c
--- /dev/null
+++ b/src/prepare.py
@@ -0,0 +1,78 @@
+import os
+import random
+import re
+import sys
+import xml.etree.ElementTree
+
+import yaml
+
+
+def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split):
+    """
+    Process the input lines and write the output to the output files.
+
+    Args:
+        input_lines (list): List of input lines.
+        fd_out_train (file): Output file for the training data set.
+        fd_out_test (file): Output file for the test data set.
+        target_tag (str): Target tag.
+        split (float): Test data set split ratio.
+    """
+    num = 1
+    for line in input_lines:
+        try:
+            fd_out = fd_out_train if random.random() > split else fd_out_test
+            attr = xml.etree.ElementTree.fromstring(line).attrib
+
+            pid = attr.get("Id", "")
+            label = 1 if target_tag in attr.get("Tags", "") else 0
+            title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
+            body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
+            text = title + " " + body
+
+            fd_out.write("{}\t{}\t{}\n".format(pid, label, text))
+
+            num += 1
+        except Exception as ex:
+            sys.stderr.write(f"Skipping the broken line {num}: {ex}\n")
+
+
+def main():
+    params = yaml.safe_load(open("params.yaml"))["prepare"]
+
+    if len(sys.argv) != 2:
+        sys.stderr.write("Arguments error. Usage:\n")
+        sys.stderr.write("\tpython prepare.py data-file\n")
+        sys.exit(1)
+
+    # Test data set split ratio
+    split = params["split"]
+    random.seed(params["seed"])
+
+    input = sys.argv[1]
+    output_train = os.path.join("data", "prepared", "train.tsv")
+    output_test = os.path.join("data", "prepared", "test.tsv")
+
+    os.makedirs(os.path.join("data", "prepared"), exist_ok=True)
+
+    input_lines = []
+    with open(input) as fd_in:
+        input_lines = fd_in.readlines()
+
+    fd_out_train = open(output_train, "w", encoding="utf-8")
+    fd_out_test = open(output_test, "w", encoding="utf-8")
+
+    process_posts(
+        input_lines=input_lines,
+        fd_out_train=fd_out_train,
+        fd_out_test=fd_out_test,
+        target_tag="<r>",
+        split=split,
+    )
+
+    fd_out_train.close()
+    fd_out_test.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/requirements.txt b/src/requirements.txt
new file mode 100644
index 0000000..e72d1a3
--- /dev/null
+++ b/src/requirements.txt
@@ -0,0 +1,6 @@
+dvclive>=3.0
+pandas
+pyaml
+scikit-learn>=1.3
+scipy
+matplotlib
diff --git a/src/train.py b/src/train.py
new file mode 100644
index 0000000..3a47645
--- /dev/null
+++ b/src/train.py
@@ -0,0 +1,65 @@
+import os
+import pickle
+import sys
+
+import numpy as np
+import yaml
+from sklearn.ensemble import RandomForestClassifier
+
+
+def train(seed, n_est, min_split, matrix):
+    """
+    Train a random forest classifier.
+
+    Args:
+        seed (int): Random seed.
+        n_est (int): Number of trees in the forest.
+        min_split (int): Minimum number of samples required to split an internal node.
+        matrix (scipy.sparse.csr_matrix): Input matrix.
+
+    Returns:
+        sklearn.ensemble.RandomForestClassifier: Trained classifier.
+    """
+    labels = np.squeeze(matrix[:, 1].toarray())
+    x = matrix[:, 2:]
+
+    sys.stderr.write("Input matrix size {}\n".format(matrix.shape))
+    sys.stderr.write("X matrix size {}\n".format(x.shape))
+    sys.stderr.write("Y matrix size {}\n".format(labels.shape))
+
+    clf = RandomForestClassifier(
+        n_estimators=n_est, min_samples_split=min_split, n_jobs=2, random_state=seed
+    )
+
+    clf.fit(x, labels)
+
+    return clf
+
+
+def main():
+    params = yaml.safe_load(open("params.yaml"))["train"]
+
+    if len(sys.argv) != 3:
+        sys.stderr.write("Arguments error. Usage:\n")
+        sys.stderr.write("\tpython train.py features model\n")
+        sys.exit(1)
+
+    input = sys.argv[1]
+    output = sys.argv[2]
+    seed = params["seed"]
+    n_est = params["n_est"]
+    min_split = params["min_split"]
+
+    # Load the data
+    with open(os.path.join(input, "train.pkl"), "rb") as fd:
+        matrix, _ = pickle.load(fd)
+
+    clf = train(seed=seed, n_est=n_est, min_split=min_split, matrix=matrix)
+
+    # Save the model
+    with open(output, "wb") as fd:
+        pickle.dump(clf, fd)
+
+
+if __name__ == "__main__":
+    main()