"""Parse every exported volume HTML file and merge them into one JSON file.

Each file is handed to ``parse_html`` (project-local ``parser`` module); the
per-file mappings are merged key by key and written out together with a
fixed metadata header.
"""

import os
import glob
import json
import re

from parser import parse_html

# Match "卷XXX之YY" (the "之YY" sub-volume part is optional).
_VOLUME_RE = re.compile(r"卷(\d+)(?:之(\d+))?")

# Entry fields that are concatenated when the same key shows up again.
_APPENDED_FIELDS = ("对语", "摘句")
# Entry fields taken from a later file only while still empty on the
# entry that was stored first.
_FILL_IF_EMPTY_FIELDS = ("卷", "声", "韵")


def natural_sort_key(s: str) -> "tuple[int, int, int]":
    """Return a sort key that orders volume files numerically.

    Args:
        s: Path (or bare file name) of a volume file; only the basename is
            inspected.

    Returns:
        ``(1, volume, sub_volume)`` when the basename contains
        ``卷<digits>`` optionally followed by ``之<digits>`` (a missing
        sub-volume counts as 0), otherwise ``(2, 0, 0)`` so that
        unnumbered files sort after all numbered ones.
    """
    basename = os.path.basename(s)
    m = _VOLUME_RE.search(basename)
    if m:
        vol = int(m.group(1))
        sub = int(m.group(2)) if m.group(2) else 0
        return (1, vol, sub)
    return (2, 0, 0)


def _merge_entry(existing: dict, incoming: dict) -> None:
    """Fold *incoming* into *existing* in place, tolerating missing fields.

    NOTE(review): the entry schema comes from ``parse_html`` and is not
    visible here; "词条" is assumed to be a mapping and "对语"/"摘句" to be
    values supporting ``+=`` (lists or strings) -- confirm against the parser.
    """
    terms = incoming.get("词条")
    if terms:
        if existing.get("词条"):
            existing["词条"].update(terms)
        else:
            existing["词条"] = dict(terms)

    for field in _APPENDED_FIELDS:
        extra = incoming.get(field)
        if not extra:
            continue
        if existing.get(field):
            existing[field] += extra
        else:
            # Nothing stored yet (missing or empty): adopt the new value
            # instead of failing on a missing key.
            existing[field] = extra

    for field in _FILL_IF_EMPTY_FIELDS:
        if not existing.get(field) and incoming.get(field):
            existing[field] = incoming[field]


def main(input_glob: str = 'html_files/卷*.html',
         output_path: str = 'peiwenyunfu.json') -> None:
    """Parse all matching HTML files and write the merged JSON document.

    Args:
        input_glob: Glob pattern selecting the HTML files to parse.
        output_path: Destination of the combined JSON output.

    Files are processed in volume order. A file whose parsing or merging
    raises is reported and counted as failed; processing continues with
    the next file.
    """
    # glob returns matches in arbitrary order; the path itself is used as a
    # tie-breaker so the merge order (and thus the output) is reproducible.
    files = sorted(
        glob.glob(input_glob),
        key=lambda path: (natural_sort_key(path), path),
    )
    total = len(files)
    print(f"Starting to parse {total} files...")

    combined_result = {}
    success_count = 0
    fail_count = 0

    for position, fpath in enumerate(files, start=1):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Remove "(韵母)" prefix
                clean_key = k.replace("(韵母)", "")
                if clean_key in combined_result:
                    _merge_entry(combined_result[clean_key], v)
                else:
                    combined_result[clean_key] = v
        except Exception as e:
            # Best-effort batch job: report the bad file and keep going.
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
        else:
            success_count += 1

        # Report the number of files handled so far (1-based), not the index.
        if position % 50 == 0:
            print(f"Parsed {position}/{total} files...")

    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")

    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等奉敕编",
            "dynasty": "清",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出",
        },
        "preface": "",
    }
    final_output.update(combined_result)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print(f"Saved output to {output_path}")


if __name__ == "__main__":
    main()