Files
spider-ctext/佩文韵府/parse_all.py
2026-03-22 16:18:35 +08:00

84 lines
3.1 KiB
Python

import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Build a sort key that orders volume files numerically.

    Only the file-name component of ``s`` is inspected. A name containing
    ``卷<digits>``, optionally followed by ``之<digits>``, maps to
    ``(1, volume, part)`` where ``part`` is 0 when the ``之`` suffix is
    absent (e.g. "卷001之1.html" -> ``(1, 1, 1)``). Any other name maps to
    ``(2, 0, 0)`` and therefore sorts after every recognised volume file.
    """
    filename = os.path.basename(s)
    match = re.search(r"卷(\d+)(?:之(\d+))?", filename)
    if match is None:
        # Unrecognised names share one key and are placed last.
        return (2, 0, 0)

    volume_text, part_text = match.groups()
    volume = int(volume_text)
    part = int(part_text) if part_text else 0
    return (1, volume, part)
def _merge_rhyme_entry(combined, key, entry):
    """Fold one parsed entry into ``combined`` under ``key``.

    The first entry seen for a key is stored as-is. Later entries are merged
    into it: the "词条" mapping is updated, the "对语" and "摘句" values are
    appended with ``+=``, and a falsy value under the empty-string key is
    back-filled from the new entry. Missing or empty fields on either side
    are tolerated instead of raising ``KeyError`` halfway through a merge.
    """
    if key not in combined:
        combined[key] = entry
        return

    existing = combined[key]

    new_terms = entry.get("词条")
    if new_terms:
        if existing.get("词条"):
            existing["词条"].update(new_terms)
        else:
            existing["词条"] = dict(new_terms)

    for appended_key in ("对语", "摘句"):
        extra = entry.get(appended_key)
        if not extra:
            continue
        if existing.get(appended_key):
            existing[appended_key] += extra
        else:
            existing[appended_key] = extra

    # If the first file did not yield a value for a field, take it from a
    # later file.
    # NOTE(review): the original repeated this back-fill three times with the
    # identical empty-string key, which suggests three distinct field names
    # were lost at some point -- confirm the intended keys against the
    # parse_html output and list them in this tuple.
    for field in ("",):
        if not existing.get(field) and entry.get(field):
            existing[field] = entry[field]


def main():
    """Parse every volume HTML file and write the merged result as JSON.

    Files matching ``html_files/卷*.html`` are parsed with ``parse_html`` in
    the order given by ``natural_sort_key``. Per-file results are merged by
    key, with any "(韵母)" marker removed from the key first, and saved
    together with fixed metadata to ``peiwenyunfu.json``. A file that fails
    to parse or merge is reported and skipped rather than aborting the run.
    """
    # Only pick files that start with "卷" to avoid "全覽" duplicate aggregations.
    files = sorted(glob.glob('html_files/卷*.html'), key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")

    combined_result = {}
    success_count = 0
    fail_count = 0

    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                _merge_rhyme_entry(combined_result, k.replace("(韵母)", ""), v)
        except Exception as e:
            # Best-effort batch run: report the bad file and keep going.
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
        else:
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")

    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")

    # Construct final output with metadata
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出",
        },
        "preface": "",
        "content": combined_result,
    }

    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")
# Run the full parse only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()