Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/佩文韵府/parse_all.py
+++ b/佩文韵府/parse_all.py
@@ -0,0 +1,83 @@
+import os
+import glob
+import json
+import re
+from parser import parse_html
+
+def natural_sort_key(s):
+    """Return a sort key that orders volume files numerically.
+
+    The basename of *s* is searched for ``卷<digits>`` optionally followed by
+    ``之<digits>`` (e.g. "卷001之1.html", "卷106之4.html").  Only digit
+    sequences are recognised; names using Chinese numerals fall through to
+    the fallback group.
+
+    Args:
+        s: Path or file name of a volume page.
+
+    Returns:
+        A tuple ``(group, volume, sub, basename)``.  ``group`` is 1 for names
+        with a parsed volume number and 2 otherwise, so unparsed names sort
+        last.  ``sub`` is 0 when there is no ``之`` part.  The trailing
+        basename is a tie-breaker: glob results come back in
+        file-system-dependent order, and without it equal keys would keep
+        that arbitrary order.
+    """
+    basename = os.path.basename(s)
+    m = re.search(r"卷(\d+)(?:之(\d+))?", basename)
+    if m is None:
+        # No numeric volume in the name: sort after all numbered volumes.
+        return (2, 0, 0, basename)
+    vol = int(m.group(1))
+    sub = int(m.group(2)) if m.group(2) else 0
+    return (1, vol, sub, basename)
+
+def _merge_entry(existing, incoming):
+    """Fold a later parse result for one rhyme character into the stored one.
+
+    Every key is treated as optional on both sides, so an entry that lacks a
+    key in its first occurrence no longer raises ``KeyError`` when a later
+    file supplies it.
+
+    Args:
+        existing: Entry already stored in the combined result; modified in
+            place.
+        incoming: Entry for the same character from a later file.
+    """
+    # "词条" is merged with dict.update, so later files win on duplicate keys.
+    # NOTE(review): assumes "词条" is a dict, as the original .update() call did.
+    incoming_terms = incoming.get("词条")
+    if incoming_terms:
+        if existing.get("词条"):
+            existing["词条"].update(incoming_terms)
+        else:
+            existing["词条"] = dict(incoming_terms)
+
+    # "对语" and "摘句" are concatenated in file order.
+    for list_key in ("对语", "摘句"):
+        extra = incoming.get(list_key)
+        if extra:
+            current = existing.get(list_key)
+            existing[list_key] = current + extra if current else extra
+
+    # Header fields: keep the first non-empty value seen.
+    for meta_key in ("卷", "声", "韵"):
+        if not existing.get(meta_key) and incoming.get(meta_key):
+            existing[meta_key] = incoming[meta_key]
+
+
+def main():
+    """Parse all volume pages in html_files/ and write peiwenyunfu.json.
+
+    Files matching ``html_files/卷*.html`` are processed in
+    ``natural_sort_key`` order.  Each file is passed to ``parse_html``; the
+    returned mapping is merged into one dict keyed by the entry name with
+    the "（韵母）" marker removed.  A file that raises is reported and
+    counted as failed, and processing continues with the next file.  The
+    combined result is written, together with fixed metadata, as UTF-8 JSON
+    to ``peiwenyunfu.json`` in the current working directory.
+    """
+    # Only pick files that start with "卷" to avoid "全覽" duplicate aggregations
+    files = sorted(glob.glob('html_files/卷*.html'), key=natural_sort_key)
+    total = len(files)
+
+    print(f"Starting to parse {total} files...")
+
+    combined_result = {}
+    success_count = 0
+    fail_count = 0
+
+    # Count from 1 so the progress line reports files actually completed.
+    for done, fpath in enumerate(files, start=1):
+        try:
+            res = parse_html(fpath)
+            for k, v in res.items():
+                # Strip the "（韵母）" marker wherever it occurs in the key.
+                clean_key = k.replace("（韵母）", "")
+                if clean_key in combined_result:
+                    _merge_entry(combined_result[clean_key], v)
+                else:
+                    combined_result[clean_key] = v
+        except Exception as e:
+            # Best-effort batch run: report the file and keep going.
+            print(f"Failed to parse {fpath}: {e}")
+            fail_count += 1
+        else:
+            success_count += 1
+            if done % 50 == 0 or done == total:
+                print(f"Parsed {done}/{total} files...")
+
+    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
+    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
+
+    # Construct final output with metadata
+    final_output = {
+        "metadata": {
+            "title": "御定佩文韵府",
+            "author": "张玉书等",
+            "dynasty": "清",
+            "total_volumes": 106,
+            "source": "2026年3月22日从维基文库导出"
+        },
+        "preface": "",
+        "content": combined_result
+    }
+
+    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
+        json.dump(final_output, f, ensure_ascii=False, indent=4)
+    print("Saved output to peiwenyunfu.json")
+
+# Run the batch conversion only when this file is executed as a script.
+if __name__ == "__main__":
+    main()