Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
79
佩文韵府/parse_all_v2.py
Normal file
79
佩文韵府/parse_all_v2.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import os
|
||||
import glob
|
||||
import json
|
||||
import re
|
||||
from parser import parse_html
|
||||
|
||||
def natural_sort_key(s):
    """Build a sort key that orders volume files numerically.

    Only the final path component of ``s`` is inspected. A name containing
    ``卷<digits>``, optionally followed by ``之<digits>``, yields
    ``(1, volume, sub_volume)`` where ``sub_volume`` is 0 when the ``之``
    part is absent. Any other name yields ``(2, 0, 0)``, so it sorts after
    every numbered volume.
    """
    filename = os.path.basename(s)
    found = re.search(r"卷(\d+)(?:之(\d+))?", filename)
    if found is None:
        return (2, 0, 0)

    volume, sub_volume = found.groups()
    # The second group is None when the name has no "之NN" suffix.
    return (1, int(volume), int(sub_volume or 0))
|
||||
|
||||
def _merge_entry(combined, key, entry):
    """Fold one parsed entry into ``combined`` under ``key``.

    The first entry seen for a key is stored as-is. Later entries for the
    same key are merged field by field:

    * ``词条`` is merged with ``update`` (later values win on duplicate keys);
    * ``对语`` and ``摘句`` are appended with ``+=``;
    * ``卷``, ``声`` and ``韵`` are filled in only while still empty.

    Missing or empty fields on either side are tolerated, so an incomplete
    entry can no longer abort the merge with ``KeyError``.

    NOTE(review): assumes each entry is a dict as produced by ``parse_html``
    -- confirm against the parser.
    """
    if key not in combined:
        combined[key] = entry
        return

    existing = combined[key]

    new_terms = entry.get("词条")
    if new_terms:
        if existing.get("词条"):
            existing["词条"].update(new_terms)
        else:
            existing["词条"] = new_terms

    for field in ("对语", "摘句"):
        addition = entry.get(field)
        if not addition:
            continue
        current = existing.get(field)
        if current:
            current += addition
            existing[field] = current
        else:
            existing[field] = addition

    for field in ("卷", "声", "韵"):
        if not existing.get(field) and entry.get(field):
            existing[field] = entry[field]


def main(*, input_pattern='html_files/卷*.html', output_path='peiwenyunfu.json'):
    """Parse every matching HTML file and write the merged result as JSON.

    Files matching ``input_pattern`` are parsed with ``parse_html`` in
    volume order, their entries are merged per rhyme character, and the
    result is written to ``output_path`` together with a fixed ``metadata``
    block and an empty ``preface``.

    A file whose parsing or merging raises is reported and counted as
    failed; processing continues with the next file.

    Args:
        input_pattern: Glob pattern selecting the HTML files to parse.
        output_path: Path of the JSON file to write (UTF-8).
    """
    # glob returns paths in arbitrary order and the merge is order-dependent,
    # so break sort-key ties on the path itself to keep the output stable.
    files = sorted(
        glob.glob(input_pattern),
        key=lambda path: (natural_sort_key(path), path),
    )
    total = len(files)

    print(f"Starting to parse {total} files...")

    combined_result = {}
    success_count = 0
    fail_count = 0

    for done, fpath in enumerate(files, start=1):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Strip the "(韵母)" marker wherever it appears in the key.
                _merge_entry(combined_result, k.replace("(韵母)", ""), v)
        except Exception as e:
            # Deliberate best-effort: one bad file must not stop the batch.
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
        else:
            success_count += 1

        # Report the number of files handled so far (1-based), whether or
        # not the file at this position succeeded.
        if done % 50 == 0:
            print(f"Parsed {done}/{total} files...")

    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")

    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等奉敕编",
            "dynasty": "清",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": ""
    }

    final_output.update(combined_result)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print(f"Saved output to {output_path}")


# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user