# spider-ctext/佩文韵府/parse_all.py

import os
import glob
import json
import re
from parser import parse_html

def natural_sort_key(s):
    """Build a sort key that orders volume files numerically.

    Only the final path component is inspected. A name containing
    "卷<digits>", optionally followed by "之<digits>" (for example
    "卷001之1.html" or "卷106之4.html"), yields ``(1, volume, part)``, with
    ``part`` set to 0 when the "之<digits>" suffix is absent. Any other name
    yields ``(2, 0, 0)``, which sorts after every recognised volume file.
    """
    filename = os.path.basename(s)
    match = re.search(r"卷(\d+)(?:之(\d+))?", filename)

    # Unrecognised names all share one key that sorts last.
    if match is None:
        return (2, 0, 0)

    volume_text, part_text = match.groups()
    part_number = int(part_text) if part_text else 0
    return (1, int(volume_text), part_number)

def _merge_entry(existing, incoming):
    """Fold a newly parsed entry into the one already collected for its key.

    Both arguments are the per-key dicts produced by ``parse_html``. Every
    field is read defensively, so an entry that lacks a field on either side
    is merged without raising instead of aborting the file half-way through.

    - "词条": incoming items are added; on a duplicate item the incoming
      value replaces the existing one.
    - "对语" / "摘句": non-empty incoming values are appended to the existing
      ones (presumably lists -- confirm against ``parse_html``).
    - "卷" / "声" / "韵": filled from ``incoming`` only while the existing
      value is missing or empty.
    """
    entries = existing.get("词条")
    if entries is None:
        entries = existing["词条"] = {}
    entries.update(incoming.get("词条") or {})

    for field in ("对语", "摘句"):
        extra = incoming.get(field)
        if not extra:
            continue
        current = existing.get(field)
        if current:
            current += extra
            existing[field] = current
        else:
            # Store a copy so later appends never touch the parser's object.
            existing[field] = extra[:]

    for field in ("卷", "声", "韵"):
        if not existing.get(field) and incoming.get(field):
            existing[field] = incoming[field]


def main():
    """Parse every volume page and write the combined result as JSON.

    Reads ``html_files/卷*.html`` relative to the current working directory,
    runs ``parse_html`` on each file in volume order, merges the entries that
    share a key, and writes the result together with fixed metadata to
    ``peiwenyunfu.json``. A file that raises is reported and counted as
    failed; processing continues with the next file.
    """
    # Only pick files that start with "卷" to avoid "全覽" duplicate aggregations.
    # glob returns paths in filesystem-dependent order, so sort by path first:
    # the stable sort below then keeps ties under natural_sort_key reproducible,
    # which matters because merge order decides which duplicate wins.
    files = sorted(glob.glob('html_files/卷*.html'))
    files.sort(key=natural_sort_key)

    print(f"Starting to parse {len(files)} files...")

    combined_result = {}
    success_count = 0
    fail_count = 0

    for position, fpath in enumerate(files, start=1):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Strip the "（韵母）" marker from the key.
                clean_key = k.replace("（韵母）", "")

                if clean_key in combined_result:
                    _merge_entry(combined_result[clean_key], v)
                else:
                    combined_result[clean_key] = v

            success_count += 1
        except Exception as e:
            # Best effort: one bad page must not abort the whole run. Entries
            # merged from this file before the error stay in the result.
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1

        # 1-based count of processed files, so the first report reads 50/N.
        if position % 50 == 0:
            print(f"Parsed {position}/{len(files)} files...")

    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")

    # Construct final output with metadata
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等",
            "dynasty": "清",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": "",
        "content": combined_result
    }

    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")

# Run the aggregation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()