Initial commit
This commit is contained in:
@@ -0,0 +1,4 @@
|
|||||||
|
[core]
|
||||||
|
remote = myremote
|
||||||
|
['remote "myremote"']
|
||||||
|
url = /mnt/safe/dvc-remote
|
||||||
|
|||||||
63
.github/workflows/cml.yaml
vendored
Normal file
63
.github/workflows/cml.yaml
vendored
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
# Builds a CML report (DVC plots plus a metrics/params diff) for each pull request.
name: CML Report
on: pull_request
jobs:
  run:
    runs-on: [ubuntu-latest]
    steps:
      - uses: iterative/setup-cml@v2
      - uses: iterative/setup-dvc@v1
      - uses: actions/checkout@v3
        with:
          # Two commits are fetched so that HEAD~1 (used below) can be resolved.
          fetch-depth: 2
      # Needed for https://github.com/iterative/example-repos-dev/issues/225
      - name: Installs JSON5
        run: npm install -g json5
      - name: Generate metrics report
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml ci
          # Baseline: the previous commit when running on main, otherwise main itself.
          # Variables are quoted so an empty or unusual ref cannot break the test
          # expression or be word-split.
          if [ "$GITHUB_REF" = refs/heads/main ]; then
            PREVIOUS_REF=HEAD~1
          else
            PREVIOUS_REF=main
            git fetch origin main:main
          fi

          dvc pull eval
          dvc plots diff "$PREVIOUS_REF" workspace \
            --show-vega --targets ROC | json5 > vega.json
          vl2svg vega.json roc.svg

          dvc plots diff "$PREVIOUS_REF" workspace \
            --show-vega --targets Precision-Recall | json5 > vega.json
          vl2svg vega.json prc.svg

          dvc plots diff "$PREVIOUS_REF" workspace \
            --show-vega --targets Confusion-Matrix | json5 > vega.json
          vl2svg vega.json confusion.svg

          cp eval/plots/images/importance.png importance_workspace.png

          # NOTE(review): only dvc.lock is restored from the baseline here; no step
          # in this workflow re-materializes eval/ afterwards, so the copy below may
          # be identical to importance_workspace.png - confirm.
          git checkout "$PREVIOUS_REF" -- dvc.lock
          cp eval/plots/images/importance.png importance_previous.png

          dvc_report=$(dvc exp diff "$PREVIOUS_REF" --md)

          cat <<EOF > report.md
          # CML Report
          ## Plots
          ![ROC](./roc.svg)
          ![Precision-Recall](./prc.svg)
          ![Confusion Matrix](./confusion.svg)
          #### Feature Importance: ${PREVIOUS_REF}
          ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
          #### Feature Importance: workspace
          ![Feature Importance: workspace](./importance_workspace.png)

          ## Metrics and Params
          ### ${PREVIOUS_REF} → workspace
          ${dvc_report}
          EOF

          cml comment create --publish --pr=false report.md
|
||||||
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
.venv
|
||||||
13
params.yaml
Normal file
13
params.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
prepare:
|
||||||
|
split: 0.20
|
||||||
|
seed: 20170428
|
||||||
|
|
||||||
|
featurize:
|
||||||
|
max_features: 100
|
||||||
|
ngrams: 1
|
||||||
|
|
||||||
|
train:
|
||||||
|
seed: 20170428
|
||||||
|
n_est: 50
|
||||||
|
min_split: 0.01
|
||||||
|
|
||||||
112
src/evaluate.py
Normal file
112
src/evaluate.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
import json
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn import metrics
|
||||||
|
from sklearn import tree
|
||||||
|
from dvclive import Live
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(model, matrix, split, live, save_path):
    """
    Dump all evaluation metrics and plots for given datasets.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        matrix (scipy.sparse.csr_matrix): Input matrix. Column 1 is read as
            the label and columns 2 onwards as the features.
        split (str): Dataset name.
        live (dvclive.Live): Dvclive instance.
        save_path (str): Path to save the metrics. Not read by this function;
            kept so existing callers keep working.
    """
    # Flatten once so every metric and plot receives the same 1-D vector.
    # ravel() stays 1-D for a single-row matrix, where squeeze() would
    # collapse it to a 0-d scalar.
    labels = matrix[:, 1].toarray().astype(int).ravel()
    x = matrix[:, 2:]

    predictions_by_class = model.predict_proba(x)
    # Score used for ranking metrics: probability of the second class.
    predictions = predictions_by_class[:, 1]

    # Use dvclive to log a few simple metrics...
    avg_prec = metrics.average_precision_score(labels, predictions)
    roc_auc = metrics.roc_auc_score(labels, predictions)
    # The first call creates the nested layout; later calls add their split.
    if not live.summary:
        live.summary = {"avg_prec": {}, "roc_auc": {}}
    live.summary["avg_prec"][split] = avg_prec
    live.summary["roc_auc"][split] = roc_auc

    # ... and plots...
    # ... like an roc plot...
    live.log_sklearn_plot("roc", labels, predictions, name=f"roc/{split}")
    # ... and precision recall plot...
    # ... which passes `drop_intermediate=True` to the sklearn method...
    live.log_sklearn_plot(
        "precision_recall",
        labels,
        predictions,
        name=f"prc/{split}",
        drop_intermediate=True,
    )
    # ... and confusion matrix plot
    live.log_sklearn_plot(
        "confusion_matrix",
        labels,
        predictions_by_class.argmax(-1),
        name=f"cm/{split}",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def save_importance_plot(live, model, feature_names):
    """
    Save feature importance plot.

    Args:
        live (dvclive.Live): DVCLive instance.
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        feature_names (list): List of feature names.
    """
    fig, axes = plt.subplots(dpi=100)
    try:
        fig.subplots_adjust(bottom=0.2, top=0.95)
        axes.set_ylabel("Mean decrease in impurity")

        importances = model.feature_importances_
        # Only the 30 largest importances are plotted.
        forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30)
        forest_importances.plot.bar(ax=axes)

        live.log_image("importance.png", fig)
    finally:
        # Release the figure so pyplot does not keep it alive after logging.
        plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Evaluate a pickled model on the train and test feature sets.

    Expects two command-line arguments: the model file and the directory
    holding ``train.pkl`` and ``test.pkl``. Metrics, plots and the feature
    importance image are logged through DVCLive under ``eval``.
    """
    eval_dir = "eval"

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython evaluate.py model features\n")
        sys.exit(1)

    _, model_path, features_dir = sys.argv
    train_path = os.path.join(features_dir, "train.pkl")
    test_path = os.path.join(features_dir, "test.pkl")

    # Load model and data (pickles are expected to come from the pipeline itself).
    with open(model_path, "rb") as model_fd:
        model = pickle.load(model_fd)

    with open(train_path, "rb") as train_fd:
        train, feature_names = pickle.load(train_fd)

    with open(test_path, "rb") as test_fd:
        test, _ = pickle.load(test_fd)

    with Live(eval_dir) as live:
        # Evaluate train and test datasets.
        for split_name, split_matrix in (("train", train), ("test", test)):
            evaluate(model, split_matrix, split_name, live, save_path=eval_dir)

        # Dump feature importance plot.
        save_importance_plot(live, model, feature_names)


if __name__ == "__main__":
    main()
|
||||||
136
src/featurization.py
Normal file
136
src/featurization.py
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import scipy.sparse as sparse
|
||||||
|
import yaml
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
||||||
|
|
||||||
|
|
||||||
|
def get_df(data):
    """Load the tab-separated input file into a data frame and report its size."""
    columns = ["id", "label", "text"]
    frame = pd.read_csv(
        data,
        sep="\t",
        names=columns,
        header=None,
        encoding="utf-8",
    )
    sys.stderr.write(f"The input data frame {data} size is {frame.shape}\n")
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def save_matrix(df, matrix, names, output):
    """
    Save the matrix to a pickle file.

    The pickled object is a ``(result, names)`` tuple, where ``result`` is a
    CSR matrix whose first column is the id, second column is the label and
    remaining columns are ``matrix``.

    Args:
        df (pandas.DataFrame): Input data frame.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        names (list): List of feature names.
        output (str): Output file name.
    """
    # A Series becomes a single-row sparse matrix; transpose it into a column.
    id_matrix = sparse.csr_matrix(df["id"].astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df["label"].astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    sys.stderr.write(
        f"The output matrix {output} size is {result.shape} "
        f"and data type is {result.dtype}\n"
    )

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Generate train feature matrix.

    Fits ``bag_of_words`` and ``tfidf`` on the training text (both objects are
    modified in place) and saves the resulting TF-IDF matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    train_words = np.array(df_train.text.str.lower().values)

    # fit_transform is equivalent to fit followed by transform, but vectorizes
    # the training corpus only once.
    train_words_count_matrix = bag_of_words.fit_transform(train_words)
    feature_names = bag_of_words.get_feature_names_out()

    train_words_tfidf_matrix = tfidf.fit_transform(train_words_count_matrix)

    save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Generate test feature matrix.

    Uses the already fitted ``bag_of_words`` and ``tfidf`` objects; neither is
    refitted here.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    frame = get_df(test_input)
    lowered_text = np.array(frame.text.str.lower().values)

    # Counts first, then TF-IDF weighting on top of the counts.
    counts = bag_of_words.transform(lowered_text)
    weighted = tfidf.transform(counts)

    save_matrix(
        frame,
        weighted,
        bag_of_words.get_feature_names_out(),
        test_output,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Build TF-IDF feature matrices for the train and test splits.

    Reads ``max_features`` and ``ngrams`` from the ``featurize`` section of
    ``params.yaml``, takes the input data directory and the output features
    directory from the command line, and writes ``train.pkl`` and ``test.pkl``
    into the output directory.
    """
    # Context manager so the params file handle is closed after parsing.
    with open("params.yaml") as params_fd:
        params = yaml.safe_load(params_fd)["featurize"]

    np.set_printoptions(suppress=True)

    # NOTE(review): four arguments are also accepted although only the first
    # two are documented and read below - confirm whether that is still needed.
    if len(sys.argv) not in (3, 5):
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

    max_features = params["max_features"]
    ngrams = params["ngrams"]

    os.makedirs(out_path, exist_ok=True)

    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    # Train must run first: it fits the vectorizer and transformer reused for test.
    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )

    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()
|
||||||
78
src/prepare.py
Normal file
78
src/prepare.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import xml.etree.ElementTree
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split):
    """
    Process the input lines and write the output to the output files.

    Each line is parsed as one XML element. Its ``Id``, a 0/1 label (1 when
    ``target_tag`` occurs in the ``Tags`` attribute) and the whitespace
    normalized ``Title`` + ``Body`` text are written as one tab-separated row.
    Lines that cannot be parsed are reported on stderr and skipped.

    Args:
        input_lines (list): List of input lines.
        fd_out_train (file): Output file for the training data set.
        fd_out_test (file): Output file for the test data set.
        target_tag (str): Target tag.
        split (float): Test data set split ratio.
    """
    # enumerate() gives the real 1-based input line number, so the message for
    # a broken line stays correct after earlier lines were skipped.
    for num, line in enumerate(input_lines, start=1):
        # Draw before parsing: a broken line still consumes one random number,
        # so the train/test assignment of the remaining rows is unchanged.
        fd_out = fd_out_train if random.random() > split else fd_out_test

        # Only parsing is guarded; write errors must not be silently skipped.
        try:
            attr = xml.etree.ElementTree.fromstring(line).attrib
        except (xml.etree.ElementTree.ParseError, ValueError) as ex:
            sys.stderr.write(f"Skipping the broken line {num}: {ex}\n")
            continue

        pid = attr.get("Id", "")
        label = 1 if target_tag in attr.get("Tags", "") else 0
        # Collapse whitespace runs so the text cannot contain tabs or newlines
        # that would break the TSV row.
        title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
        body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
        text = title + " " + body

        fd_out.write(f"{pid}\t{label}\t{text}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Split the raw posts file into train and test TSV files.

    Reads ``split`` and ``seed`` from the ``prepare`` section of
    ``params.yaml``, takes the input data file from the command line and
    writes ``data/prepared/train.tsv`` and ``data/prepared/test.tsv``.
    """
    # Context manager so the params file handle is closed after parsing.
    with open("params.yaml") as params_fd:
        params = yaml.safe_load(params_fd)["prepare"]

    if len(sys.argv) != 2:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython prepare.py data-file\n")
        sys.exit(1)

    # Test data set split ratio
    split = params["split"]
    random.seed(params["seed"])

    input_file = sys.argv[1]  # renamed so the builtin input() is not shadowed
    prepared_dir = os.path.join("data", "prepared")
    output_train = os.path.join(prepared_dir, "train.tsv")
    output_test = os.path.join(prepared_dir, "test.tsv")

    os.makedirs(prepared_dir, exist_ok=True)

    # Read as UTF-8 to match the encoding the outputs are written with,
    # instead of relying on the platform default.
    with open(input_file, encoding="utf-8") as fd_in:
        input_lines = fd_in.readlines()

    # Both outputs are closed even if processing raises part-way through.
    with open(output_train, "w", encoding="utf-8") as fd_out_train, open(
        output_test, "w", encoding="utf-8"
    ) as fd_out_test:
        process_posts(
            input_lines=input_lines,
            fd_out_train=fd_out_train,
            fd_out_test=fd_out_test,
            target_tag="<r>",
            split=split,
        )


if __name__ == "__main__":
    main()
|
||||||
6
src/requirements.txt
Normal file
6
src/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
dvclive>=3.0
|
||||||
|
pandas
|
||||||
|
pyaml
|
||||||
|
scikit-learn>=1.3
|
||||||
|
scipy
|
||||||
|
matplotlib
|
||||||
65
src/train.py
Normal file
65
src/train.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import yaml
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
|
||||||
|
|
||||||
|
def train(seed, n_est, min_split, matrix):
    """
    Train a random forest classifier.

    Args:
        seed (int): Random seed.
        n_est (int): Number of trees in the forest.
        min_split (int or float): Passed to ``min_samples_split``: the minimum
            number of samples required to split an internal node.
            scikit-learn treats a float as a fraction of the samples.
        matrix (scipy.sparse.csr_matrix): Input matrix. Column 1 is read as
            the label and columns 2 onwards as the features.

    Returns:
        sklearn.ensemble.RandomForestClassifier: Trained classifier.
    """
    # ravel() always yields a 1-D label vector; squeeze() would turn a
    # single-row matrix into a 0-d scalar.
    labels = matrix[:, 1].toarray().ravel()
    x = matrix[:, 2:]

    sys.stderr.write(f"Input matrix size {matrix.shape}\n")
    sys.stderr.write(f"X matrix size {x.shape}\n")
    sys.stderr.write(f"Y matrix size {labels.shape}\n")

    clf = RandomForestClassifier(
        n_estimators=n_est, min_samples_split=min_split, n_jobs=2, random_state=seed
    )

    clf.fit(x, labels)

    return clf
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Train the classifier on the prepared features and pickle it.

    Reads ``seed``, ``n_est`` and ``min_split`` from the ``train`` section of
    ``params.yaml``. Takes the features directory (containing ``train.pkl``)
    and the output model file from the command line.
    """
    # Context manager so the params file handle is closed after parsing.
    with open("params.yaml") as params_fd:
        params = yaml.safe_load(params_fd)["train"]

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython train.py features model\n")
        sys.exit(1)

    # Named so that the builtin input() is not shadowed.
    features_dir = sys.argv[1]
    model_file = sys.argv[2]
    seed = params["seed"]
    n_est = params["n_est"]
    min_split = params["min_split"]

    # Load the data; the stored feature names are not needed for training.
    with open(os.path.join(features_dir, "train.pkl"), "rb") as fd:
        matrix, _ = pickle.load(fd)

    clf = train(seed=seed, n_est=n_est, min_split=min_split, matrix=matrix)

    # Save the model
    with open(model_file, "wb") as fd:
        pickle.dump(clf, fd)


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user