Initial commit

This commit is contained in:
2026-03-14 21:31:00 +08:00
parent 58dfd5548d
commit b77c37cfa5
9 changed files with 478 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
[core]
remote = myremote
['remote "myremote"']
url = /mnt/safe/dvc-remote

63
.github/workflows/cml.yaml vendored Normal file
View File

@@ -0,0 +1,63 @@
# CML report workflow: on every pull request, regenerate evaluation plots and
# metrics diffs against the previous ref and post them as a PR comment.
# (Indentation restored — the YAML structure had been flattened.)
name: CML Report
on: pull_request
jobs:
  run:
    runs-on: [ubuntu-latest]
    steps:
      - uses: iterative/setup-cml@v2
      - uses: iterative/setup-dvc@v1
      - uses: actions/checkout@v3
        with:
          # Need the parent commit so `dvc plots diff HEAD~1` works on main.
          fetch-depth: 2
      # Needed for https://github.com/iterative/example-repos-dev/issues/225
      - name: Installs JSON5
        run: npm install -g json5
      - name: Generate metrics report
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml ci
          # Compare against the parent commit on main, otherwise against main.
          if [ $GITHUB_REF = refs/heads/main ]; then
            PREVIOUS_REF=HEAD~1
          else
            PREVIOUS_REF=main
            git fetch origin main:main
          fi
          dvc pull eval
          # Each plot diff is emitted as Vega JSON (via json5) then rendered to SVG.
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets ROC | json5 > vega.json
          vl2svg vega.json roc.svg
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Precision-Recall | json5 > vega.json
          vl2svg vega.json prc.svg
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Confusion-Matrix | json5 > vega.json
          vl2svg vega.json confusion.svg
          # Snapshot current importance image, then the previous ref's version.
          cp eval/plots/images/importance.png importance_workspace.png
          git checkout $PREVIOUS_REF -- dvc.lock
          cp eval/plots/images/importance.png importance_previous.png
          dvc_report=$(dvc exp diff $PREVIOUS_REF --md)
          cat <<EOF > report.md
          # CML Report
          ## Plots
          ![ROC](./roc.svg)
          ![Precision-Recall](./prc.svg)
          ![Confusion Matrix](./confusion.svg)
          #### Feature Importance: ${PREVIOUS_REF}
          ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
          #### Feature Importance: workspace
          ![Feature Importance: workspace](./importance_workspace.png)
          ## Metrics and Params
          ### ${PREVIOUS_REF} → workspace
          ${dvc_report}
          EOF
          cml comment create --publish --pr=false report.md

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
.venv

13
params.yaml Normal file
View File

@@ -0,0 +1,13 @@
# Pipeline stage parameters, read via yaml.safe_load in src/*.py.
# (Nesting restored — the YAML structure had been flattened.)
prepare:
  split: 0.20        # test-set split ratio used by process_posts
  seed: 20170428     # random.seed for the train/test routing
featurize:
  max_features: 100  # CountVectorizer max_features (vocabulary cap)
  ngrams: 1          # upper bound of CountVectorizer ngram_range
train:
  seed: 20170428     # random_state for RandomForestClassifier
  n_est: 50          # n_estimators (number of trees)
  min_split: 0.01    # min_samples_split (fraction of samples)

112
src/evaluate.py Normal file
View File

@@ -0,0 +1,112 @@
import json
import math
import os
import pickle
import sys
import pandas as pd
from sklearn import metrics
from sklearn import tree
from dvclive import Live
from matplotlib import pyplot as plt
def evaluate(model, matrix, split, live, save_path):
    """
    Log all evaluation metrics and plots for one dataset split.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        matrix (scipy.sparse.csr_matrix): Input matrix; column 1 holds the
            labels and columns 2+ hold the features.
        split (str): Dataset name (e.g. "train" or "test").
        live (dvclive.Live): Dvclive instance used for logging.
        save_path (str): Path to save the metrics. (Unused in the body;
            kept for interface compatibility.)
    """
    labels = matrix[:, 1].toarray().astype(int)
    features = matrix[:, 2:]

    proba = model.predict_proba(features)
    positive_scores = proba[:, 1]

    # Scalar summary metrics, keyed per split under a shared summary dict.
    avg_prec = metrics.average_precision_score(labels, positive_scores)
    roc_auc = metrics.roc_auc_score(labels, positive_scores)
    if not live.summary:
        live.summary = {"avg_prec": {}, "roc_auc": {}}
    live.summary["avg_prec"][split] = avg_prec
    live.summary["roc_auc"][split] = roc_auc

    # ROC curve plot.
    live.log_sklearn_plot("roc", labels, positive_scores, name=f"roc/{split}")

    # Precision-recall plot; drop_intermediate is forwarded to sklearn.
    live.log_sklearn_plot(
        "precision_recall",
        labels,
        positive_scores,
        name=f"prc/{split}",
        drop_intermediate=True,
    )

    # Confusion-matrix plot built from hard class predictions.
    live.log_sklearn_plot(
        "confusion_matrix",
        labels.squeeze(),
        proba.argmax(-1),
        name=f"cm/{split}",
    )
def save_importance_plot(live, model, feature_names):
    """
    Log a bar plot of the 30 largest feature importances.

    Args:
        live (dvclive.Live): DVCLive instance.
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        feature_names (list): Feature names aligned with
            model.feature_importances_.
    """
    fig, axes = plt.subplots(dpi=100)
    fig.subplots_adjust(bottom=0.2, top=0.95)
    axes.set_ylabel("Mean decrease in impurity")

    # Keep only the top-30 features so the bar chart stays readable.
    top_importances = pd.Series(
        model.feature_importances_, index=feature_names
    ).nlargest(n=30)
    top_importances.plot.bar(ax=axes)

    live.log_image("importance.png", fig)
def main():
    """CLI entry point: evaluate a trained model on train/test features."""
    EVAL_PATH = "eval"

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython evaluate.py model features\n")
        sys.exit(1)

    model_file = sys.argv[1]
    features_dir = sys.argv[2]

    # Load the pickled model and the featurized train/test matrices.
    with open(model_file, "rb") as fd:
        model = pickle.load(fd)
    with open(os.path.join(features_dir, "train.pkl"), "rb") as fd:
        train, feature_names = pickle.load(fd)
    with open(os.path.join(features_dir, "test.pkl"), "rb") as fd:
        test, _ = pickle.load(fd)

    # Evaluate both splits inside a single dvclive run, then log the
    # feature-importance image.
    with Live(EVAL_PATH) as live:
        evaluate(model, train, "train", live, save_path=EVAL_PATH)
        evaluate(model, test, "test", live, save_path=EVAL_PATH)
        save_importance_plot(live, model, feature_names)


if __name__ == "__main__":
    main()

136
src/featurization.py Normal file
View File

@@ -0,0 +1,136 @@
import os
import pickle
import sys
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import yaml
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
def get_df(data):
    """Read one tab-separated input file into an (id, label, text) DataFrame."""
    frame = pd.read_csv(
        data,
        delimiter="\t",
        header=None,
        names=["id", "label", "text"],
        encoding="utf-8",
    )
    # Report the loaded size on stderr so stdout stays clean for pipelines.
    sys.stderr.write(f"The input data frame {data} size is {frame.shape}\n")
    return frame
def save_matrix(df, matrix, names, output):
    """
    Serialize (matrix, feature names) to a pickle file.

    The ``id`` and ``label`` columns of ``df`` are prepended to ``matrix``,
    so the saved matrix columns are: id, label, then the features.

    Args:
        df (pandas.DataFrame): Input data frame with ``id`` and ``label`` columns.
        matrix (scipy.sparse.csr_matrix): Feature matrix.
        names (list): List of feature names.
        output (str): Output file name.
    """
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = "The output matrix {} size is {} and data type is {}\n"
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)
    # Fix: removed a stray trailing `pass` statement (dead code).
def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Fit the vectorizers on the training text and save the train feature matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    corpus = np.array(df_train.text.str.lower().values)

    # Fit on the training corpus; the fitted vectorizers are reused later
    # for the test split so both share one vocabulary and IDF weights.
    bag_of_words.fit(corpus)
    counts = bag_of_words.transform(corpus)
    feature_names = bag_of_words.get_feature_names_out()

    tfidf.fit(counts)
    weighted = tfidf.transform(counts)

    save_matrix(df_train, weighted, feature_names, train_output)
def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Transform the test text with already-fitted vectorizers and save the matrix.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_test = get_df(test_input)
    corpus = np.array(df_test.text.str.lower().values)

    # Transform only — fitting happened on the training split.
    weighted = tfidf.transform(bag_of_words.transform(corpus))

    save_matrix(df_test, weighted, bag_of_words.get_feature_names_out(), test_output)
def main():
    """CLI entry point: featurize prepared TSV splits into pickled matrices."""
    # Fix: use a context manager so the params file handle is closed
    # promptly instead of being leaked by a bare open().
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["featurize"]

    np.set_printoptions(suppress=True)

    # NOTE(review): 5 arguments are also accepted although the usage string
    # documents only 2; the extra arguments are ignored. Kept as-is for
    # backward compatibility — confirm whether the 5-arg form is still used.
    if len(sys.argv) != 3 and len(sys.argv) != 5:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

    max_features = params["max_features"]
    ngrams = params["ngrams"]

    os.makedirs(out_path, exist_ok=True)

    # One shared vectorizer pair: fitted on train, reused for test.
    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )
    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()

78
src/prepare.py Normal file
View File

@@ -0,0 +1,78 @@
import os
import random
import re
import sys
import xml.etree.ElementTree
import yaml
def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split):
    """
    Split raw XML post lines into train/test TSV rows.

    Each line is parsed as a single XML element; its Id, Tags, Title and Body
    attributes become one tab-separated row (id, label, text). ``label`` is 1
    when ``target_tag`` occurs in the Tags attribute, else 0. Broken lines
    are skipped with a message on stderr.

    Args:
        input_lines (list): List of raw XML input lines.
        fd_out_train (file): Output file for the training data set.
        fd_out_test (file): Output file for the test data set.
        target_tag (str): Tag whose presence marks a positive example.
        split (float): Test data set split ratio.
    """
    # Fix: the counter previously advanced only on success, so the line
    # number reported for broken lines drifted after the first failure.
    # enumerate() always reflects the true 1-based line index.
    for num, line in enumerate(input_lines, start=1):
        try:
            # Route each post randomly according to the split ratio.
            fd_out = fd_out_train if random.random() > split else fd_out_test

            attr = xml.etree.ElementTree.fromstring(line).attrib
            pid = attr.get("Id", "")
            label = 1 if target_tag in attr.get("Tags", "") else 0
            # Collapse whitespace runs so the text fits on one TSV line.
            title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
            body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
            text = title + " " + body

            fd_out.write("{}\t{}\t{}\n".format(pid, label, text))
        except Exception as ex:
            sys.stderr.write(f"Skipping the broken line {num}: {ex}\n")
def main():
    """CLI entry point: split the raw posts file into train/test TSVs."""
    # Fix: close params.yaml promptly instead of leaking the handle.
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["prepare"]

    if len(sys.argv) != 2:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython prepare.py data-file\n")
        sys.exit(1)

    # Test data set split ratio, seeded for a deterministic split.
    split = params["split"]
    random.seed(params["seed"])

    input_file = sys.argv[1]  # renamed from `input` (shadowed the builtin)
    output_train = os.path.join("data", "prepared", "train.tsv")
    output_test = os.path.join("data", "prepared", "test.tsv")

    os.makedirs(os.path.join("data", "prepared"), exist_ok=True)

    with open(input_file) as fd_in:
        input_lines = fd_in.readlines()

    # Fix: context managers guarantee the output files are closed even if
    # process_posts raises; the originals were left open on error.
    with open(output_train, "w", encoding="utf-8") as fd_out_train, open(
        output_test, "w", encoding="utf-8"
    ) as fd_out_test:
        process_posts(
            input_lines=input_lines,
            fd_out_train=fd_out_train,
            fd_out_test=fd_out_test,
            target_tag="<r>",
            split=split,
        )


if __name__ == "__main__":
    main()

6
src/requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
dvclive>=3.0
pandas
pyaml
scikit-learn>=1.3
scipy
matplotlib

65
src/train.py Normal file
View File

@@ -0,0 +1,65 @@
import os
import pickle
import sys
import numpy as np
import yaml
from sklearn.ensemble import RandomForestClassifier
def train(seed, n_est, min_split, matrix):
    """
    Fit a random forest classifier on a featurized matrix.

    Args:
        seed (int): Random seed.
        n_est (int): Number of trees in the forest.
        min_split (int or float): Minimum number (or fraction) of samples
            required to split an internal node.
        matrix (scipy.sparse.csr_matrix): Input matrix; column 1 holds the
            labels, columns 2+ hold the features.

    Returns:
        sklearn.ensemble.RandomForestClassifier: Trained classifier.
    """
    labels = np.squeeze(matrix[:, 1].toarray())
    features = matrix[:, 2:]

    # Report the shapes on stderr for pipeline debugging.
    for name, shape in (
        ("Input", matrix.shape),
        ("X", features.shape),
        ("Y", labels.shape),
    ):
        sys.stderr.write("{} matrix size {}\n".format(name, shape))

    clf = RandomForestClassifier(
        n_estimators=n_est, min_samples_split=min_split, n_jobs=2, random_state=seed
    )
    return clf.fit(features, labels)
def main():
    """CLI entry point: train the model from pickled features and save it."""
    # Fix: close params.yaml promptly instead of leaking the handle.
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["train"]

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython train.py features model\n")
        sys.exit(1)

    features_dir = sys.argv[1]  # renamed from `input` (shadowed the builtin)
    model_path = sys.argv[2]

    seed = params["seed"]
    n_est = params["n_est"]
    min_split = params["min_split"]

    # Load the featurized training matrix (feature names are unused here).
    with open(os.path.join(features_dir, "train.pkl"), "rb") as fd:
        matrix, _ = pickle.load(fd)

    clf = train(seed=seed, n_est=n_est, min_split=min_split, matrix=matrix)

    # Persist the fitted model.
    with open(model_path, "wb") as fd:
        pickle.dump(clf, fd)


if __name__ == "__main__":
    main()