Initial commit
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
[core]
|
||||
remote = myremote
|
||||
['remote "myremote"']
|
||||
url = /mnt/safe/dvc-remote
|
||||
|
||||
63
.github/workflows/cml.yaml
vendored
Normal file
63
.github/workflows/cml.yaml
vendored
Normal file
@@ -0,0 +1,63 @@
|
||||
# GitHub Actions workflow: build a CML report (DVC plots + metrics diff)
# and post it as a comment on every pull request.
name: CML Report
on: pull_request
jobs:
  run:
    runs-on: [ubuntu-latest]
    steps:
      - uses: iterative/setup-cml@v2
      - uses: iterative/setup-dvc@v1
      - uses: actions/checkout@v3
        with:
          # Depth 2 so HEAD~1 exists when diffing on main.
          fetch-depth: 2
      # Needed for https://github.com/iterative/example-repos-dev/issues/225
      - name: Installs JSON5
        run: npm install -g json5
      - name: Generate metrics report
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml ci
          # On main, compare against the previous commit; on a PR branch,
          # compare against main (which must be fetched first).
          if [ $GITHUB_REF = refs/heads/main ]; then
            PREVIOUS_REF=HEAD~1
          else
            PREVIOUS_REF=main
            git fetch origin main:main
          fi

          dvc pull eval
          # Each plot: render the vega spec via json5, then convert to SVG.
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets ROC | json5 > vega.json
          vl2svg vega.json roc.svg

          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Precision-Recall | json5 > vega.json
          vl2svg vega.json prc.svg

          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Confusion-Matrix | json5 > vega.json
          vl2svg vega.json confusion.svg

          cp eval/plots/images/importance.png importance_workspace.png

          # NOTE(review): only dvc.lock is checked out before copying the
          # "previous" image — presumably a dvc checkout/pull of eval is
          # expected to update the image; confirm this step works as intended.
          git checkout $PREVIOUS_REF -- dvc.lock
          cp eval/plots/images/importance.png importance_previous.png

          dvc_report=$(dvc exp diff $PREVIOUS_REF --md)

          # Heredoc body is emitted verbatim into report.md (no comments here).
          cat <<EOF > report.md
          # CML Report
          ## Plots
          ![](./roc.svg)
          ![](./prc.svg)
          ![](./confusion.svg)
          #### Feature Importance: ${PREVIOUS_REF}
          ![](./importance_previous.png)
          #### Feature Importance: workspace
          ![](./importance_workspace.png)

          ## Metrics and Params
          ### ${PREVIOUS_REF} → workspace
          ${dvc_report}
          EOF

          cml comment create --publish --pr=false report.md
|
||||
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
.venv
|
||||
13
params.yaml
Normal file
13
params.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
# Pipeline stage parameters, read via yaml.safe_load("params.yaml") in src/*.py.
prepare:
  split: 0.20      # test data set split ratio (src/prepare.py)
  seed: 20170428   # random seed so the train/test split is reproducible

featurize:
  max_features: 100  # CountVectorizer max_features (src/featurization.py)
  ngrams: 1          # upper bound of ngram_range=(1, ngrams)

train:
  seed: 20170428   # random_state for the RandomForestClassifier (src/train.py)
  n_est: 50        # n_estimators: number of trees in the forest
  min_split: 0.01  # min_samples_split (fraction of samples per split)
|
||||
|
||||
112
src/evaluate.py
Normal file
112
src/evaluate.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import pickle
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
from sklearn import metrics
|
||||
from sklearn import tree
|
||||
from dvclive import Live
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
|
||||
def evaluate(model, matrix, split, live, save_path):
    """
    Dump all evaluation metrics and plots for a given dataset split.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        split (str): Dataset name.
        live (dvclive.Live): Dvclive instance.
        save_path (str): Path to save the metrics.
    """
    # Matrix layout: column 1 holds the labels, columns 2.. the features.
    labels = matrix[:, 1].toarray().astype(int)
    features = matrix[:, 2:]

    probas = model.predict_proba(features)
    pos_scores = probas[:, 1]

    # Scalar metrics go into the dvclive summary, keyed by split name.
    avg_prec = metrics.average_precision_score(labels, pos_scores)
    roc_auc = metrics.roc_auc_score(labels, pos_scores)
    if not live.summary:
        live.summary = {"avg_prec": {}, "roc_auc": {}}
    for metric_name, value in (("avg_prec", avg_prec), ("roc_auc", roc_auc)):
        live.summary[metric_name][split] = value

    # ROC curve plot.
    live.log_sklearn_plot("roc", labels, pos_scores, name=f"roc/{split}")
    # Precision-recall plot; drop_intermediate is forwarded to sklearn.
    live.log_sklearn_plot(
        "precision_recall",
        labels,
        pos_scores,
        name=f"prc/{split}",
        drop_intermediate=True,
    )
    # The confusion matrix needs hard class predictions, not probabilities.
    live.log_sklearn_plot(
        "confusion_matrix",
        labels.squeeze(),
        probas.argmax(-1),
        name=f"cm/{split}",
    )
|
||||
|
||||
|
||||
def save_importance_plot(live, model, feature_names):
    """
    Log a bar plot of the largest feature importances.

    Args:
        live (dvclive.Live): DVCLive instance.
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        feature_names (list): List of feature names.
    """
    fig, axes = plt.subplots(dpi=100)
    fig.subplots_adjust(bottom=0.2, top=0.95)
    axes.set_ylabel("Mean decrease in impurity")

    # Only the 30 most important features are shown to keep the plot readable.
    top_importances = pd.Series(
        model.feature_importances_, index=feature_names
    ).nlargest(n=30)
    top_importances.plot.bar(ax=axes)

    live.log_image("importance.png", fig)
|
||||
|
||||
|
||||
def main():
    """Entry point: load model and features, then log evaluation artifacts."""
    EVAL_PATH = "eval"

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython evaluate.py model features\n")
        sys.exit(1)

    model_file = sys.argv[1]
    features_dir = sys.argv[2]
    train_file = os.path.join(features_dir, "train.pkl")
    test_file = os.path.join(features_dir, "test.pkl")

    # Deserialize the trained model and the pickled feature matrices.
    with open(model_file, "rb") as f:
        model = pickle.load(f)
    with open(train_file, "rb") as f:
        train, feature_names = pickle.load(f)
    with open(test_file, "rb") as f:
        test, _ = pickle.load(f)

    # Evaluate both splits inside one Live context so every metric and
    # plot lands in the same report, then add the importance plot.
    with Live(EVAL_PATH) as live:
        evaluate(model, train, "train", live, save_path=EVAL_PATH)
        evaluate(model, test, "test", live, save_path=EVAL_PATH)
        save_importance_plot(live, model, feature_names)


if __name__ == "__main__":
    main()
|
||||
136
src/featurization.py
Normal file
136
src/featurization.py
Normal file
@@ -0,0 +1,136 @@
|
||||
import os
|
||||
import pickle
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import scipy.sparse as sparse
|
||||
import yaml
|
||||
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
||||
|
||||
|
||||
def get_df(data):
    """Read the tab-separated input file and return it as a data frame."""
    # Input has no header row; columns are fixed by position.
    read_opts = dict(
        encoding="utf-8",
        header=None,
        delimiter="\t",
        names=["id", "label", "text"],
    )
    frame = pd.read_csv(data, **read_opts)
    sys.stderr.write(f"The input data frame {data} size is {frame.shape}\n")
    return frame
|
||||
|
||||
|
||||
def save_matrix(df, matrix, names, output):
    """
    Combine id, label, and feature columns and pickle them to ``output``.

    Args:
        df (pandas.DataFrame): Input data frame (provides ``id`` and ``label``).
        matrix (scipy.sparse.csr_matrix): Feature matrix.
        names (list): List of feature names.
        output (str): Output file name.
    """
    # Prepend id and label as the first two columns of the result matrix,
    # matching the layout that train.py / evaluate.py slice with [:, 1] / [:, 2:].
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = "The output matrix {} size is {} and data type is {}\n"
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    # Fix: removed a stray dead `pass` statement that trailed the with-block.
    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)
|
||||
|
||||
|
||||
def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Fit the vectorizers on the training data and save its feature matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    corpus = np.array(df_train.text.str.lower().values)

    # Both transformers are fitted on the training corpus only; the test
    # set is later transformed with the same (already fitted) objects.
    bag_of_words.fit(corpus)
    counts = bag_of_words.transform(corpus)
    feature_names = bag_of_words.get_feature_names_out()

    tfidf.fit(counts)
    weighted = tfidf.transform(counts)

    save_matrix(df_train, weighted, feature_names, train_output)
|
||||
|
||||
|
||||
def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Transform the test data with already-fitted vectorizers and save it.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_test = get_df(test_input)
    corpus = np.array(df_test.text.str.lower().values)

    # Transform only — fitting happened on the training set.
    counts = bag_of_words.transform(corpus)
    weighted = tfidf.transform(counts)
    feature_names = bag_of_words.get_feature_names_out()

    save_matrix(df_test, weighted, feature_names, test_output)
|
||||
|
||||
|
||||
def main():
    """Entry point: build train/test TF-IDF feature matrices from TSV input."""
    params = yaml.safe_load(open("params.yaml"))["featurize"]

    np.set_printoptions(suppress=True)

    # NOTE(review): a 5-argument invocation is accepted here but the extra
    # arguments are never used, and the usage message documents only the
    # 2-argument form — confirm whether the `!= 5` branch is intentional.
    if len(sys.argv) != 3 and len(sys.argv) != 5:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path, out_path = sys.argv[1], sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

    os.makedirs(out_path, exist_ok=True)

    # The same vectorizer/transformer pair is shared by both splits so the
    # test set reuses the vocabulary and idf weights fitted on train.
    bag_of_words = CountVectorizer(
        stop_words="english",
        max_features=params["max_features"],
        ngram_range=(1, params["ngrams"]),
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )

    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()
|
||||
78
src/prepare.py
Normal file
78
src/prepare.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import xml.etree.ElementTree
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split):
    """
    Parse XML post lines and write TSV rows to the train/test output files.

    Each post is routed to the test set with probability ``split`` and to
    the training set otherwise. Lines that fail to parse are skipped with
    a message on stderr.

    Args:
        input_lines (list): List of input lines.
        fd_out_train (file): Output file for the training data set.
        fd_out_test (file): Output file for the test data set.
        target_tag (str): Target tag.
        split (float): Test data set split ratio.
    """
    # Fix: enumerate yields the true 1-based line number. The previous
    # hand-rolled counter was only incremented on success, so the
    # "Skipping the broken line" message reported the wrong number.
    for num, line in enumerate(input_lines, start=1):
        try:
            fd_out = fd_out_train if random.random() > split else fd_out_test
            attr = xml.etree.ElementTree.fromstring(line).attrib

            pid = attr.get("Id", "")
            # Binary label: does the post carry the target tag?
            label = 1 if target_tag in attr.get("Tags", "") else 0
            # Collapse all runs of whitespace to single spaces.
            title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
            body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
            text = title + " " + body

            fd_out.write("{}\t{}\t{}\n".format(pid, label, text))
        except Exception as ex:
            # Best-effort by design: a malformed line must not abort the run.
            sys.stderr.write(f"Skipping the broken line {num}: {ex}\n")
|
||||
|
||||
|
||||
def main():
    """Entry point: split the raw XML posts file into train/test TSV files."""
    params = yaml.safe_load(open("params.yaml"))["prepare"]

    if len(sys.argv) != 2:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython prepare.py data-file\n")
        sys.exit(1)

    # Test data set split ratio; seeding makes the split reproducible.
    split = params["split"]
    random.seed(params["seed"])

    # Renamed from `input` to avoid shadowing the builtin.
    input_file = sys.argv[1]
    output_train = os.path.join("data", "prepared", "train.tsv")
    output_test = os.path.join("data", "prepared", "test.tsv")

    os.makedirs(os.path.join("data", "prepared"), exist_ok=True)

    with open(input_file) as fd_in:
        input_lines = fd_in.readlines()

    # Fix: context managers guarantee both outputs are flushed and closed
    # even if process_posts raises (they were previously closed manually,
    # leaking the handles on error).
    with open(output_train, "w", encoding="utf-8") as fd_out_train, open(
        output_test, "w", encoding="utf-8"
    ) as fd_out_test:
        process_posts(
            input_lines=input_lines,
            fd_out_train=fd_out_train,
            fd_out_test=fd_out_test,
            target_tag="<r>",
            split=split,
        )


if __name__ == "__main__":
    main()
|
||||
6
src/requirements.txt
Normal file
6
src/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
# Runtime dependencies of the src/ pipeline scripts.
dvclive>=3.0       # metrics/plots logging (src/evaluate.py)
pandas
pyaml              # NOTE(review): code imports `yaml` (PyYAML); pyaml only pulls it in transitively — confirm this is intended
scikit-learn>=1.3
scipy
matplotlib         # feature-importance plot (src/evaluate.py)
|
||||
65
src/train.py
Normal file
65
src/train.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import os
|
||||
import pickle
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import yaml
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
|
||||
def train(seed, n_est, min_split, matrix):
    """
    Train a random forest classifier.

    Args:
        seed (int): Random seed.
        n_est (int): Number of trees in the forest.
        min_split (int): Minimum number of samples required to split an internal node.
        matrix (scipy.sparse.csr_matrix): Input matrix.

    Returns:
        sklearn.ensemble.RandomForestClassifier: Trained classifier.
    """
    # Matrix layout: column 1 holds the labels, columns 2.. the features.
    labels = np.squeeze(matrix[:, 1].toarray())
    features = matrix[:, 2:]

    for template, shape in (
        ("Input matrix size {}\n", matrix.shape),
        ("X matrix size {}\n", features.shape),
        ("Y matrix size {}\n", labels.shape),
    ):
        sys.stderr.write(template.format(shape))

    clf = RandomForestClassifier(
        n_estimators=n_est,
        min_samples_split=min_split,
        n_jobs=2,
        random_state=seed,
    )
    clf.fit(features, labels)

    return clf
|
||||
|
||||
|
||||
def main():
    """Entry point: train the model on pickled features and pickle it out."""
    params = yaml.safe_load(open("params.yaml"))["train"]

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython train.py features model\n")
        sys.exit(1)

    features_dir, model_path = sys.argv[1], sys.argv[2]

    # Load the training feature matrix; feature names are not needed here.
    with open(os.path.join(features_dir, "train.pkl"), "rb") as fd:
        matrix, _ = pickle.load(fd)

    clf = train(
        seed=params["seed"],
        n_est=params["n_est"],
        min_split=params["min_split"],
        matrix=matrix,
    )

    # Persist the fitted model.
    with open(model_path, "wb") as fd:
        pickle.dump(clf, fd)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user