From 620d1dfb2d9e3a1be7adc24fefa81368ecee8cb0 Mon Sep 17 00:00:00 2001 From: Cunliang Kong Date: Sat, 14 Mar 2026 21:44:33 +0800 Subject: [PATCH] pipeline defined --- .gitignore | 1 + data/.gitignore | 2 ++ dvc.lock | 23 +++++++++++++++++++++++ dvc.yaml | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 58 insertions(+) create mode 100644 dvc.lock create mode 100644 dvc.yaml diff --git a/.gitignore b/.gitignore index 1d17dae..8fdb044 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .venv +/model.pkl diff --git a/data/.gitignore b/data/.gitignore index fc12be5..830cc79 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -1 +1,3 @@ /data.xml +/prepared +/features diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..bb09739 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,23 @@ +schema: '2.0' +stages: + prepare: + cmd: python src/prepare.py data/data.xml + deps: + - path: data/data.xml + hash: md5 + md5: 22a1a2931c8370d3aeedd7183606fd7f + size: 14445097 + - path: src/prepare.py + hash: md5 + md5: f54d670ac8a4f63206781fc31d1f2651 + size: 2231 + params: + params.yaml: + prepare.seed: 20170428 + prepare.split: 0.2 + outs: + - path: data/prepared + hash: md5 + md5: 153aad06d376b6595932470e459ef42a.dir + size: 8437363 + nfiles: 2 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..fa6d406 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,32 @@ +stages: + prepare: + cmd: python src/prepare.py data/data.xml + deps: + - data/data.xml + - src/prepare.py + params: + - prepare.seed + - prepare.split + outs: + - data/prepared + featurize: + cmd: python src/featurization.py data/prepared data/features + deps: + - data/prepared + - src/featurization.py + params: + - featurize.max_features + - featurize.ngrams + outs: + - data/features + train: + cmd: python src/train.py data/features model.pkl + deps: + - data/features + - src/train.py + params: + - train.min_split + - train.n_est + - train.seed + outs: + - model.pkl