Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/初学记/test_parse2.py
+++ b/初学记/test_parse2.py
@@ -0,0 +1,93 @@
+import bs4
+import os
+import json
+import re
+
+# Relative directory that holds the chapter .xhtml files read by this script
+# (presumably the OPS folder of an unpacked EPUB -- confirm against the data layout).
+html_dir = "epub_extracted/OPS"
+
+
+def parse_html(filepath):
+    """Return the <br>-separated text lines found in the file's poem blocks.
+
+    The file is read as UTF-8 and parsed with BeautifulSoup's "html.parser".
+    Every <p> inside every <div class="poem"> is walked child by child: a
+    <br> child closes the current line, any other child contributes its text.
+    Each emitted line is the concatenation of those texts with leading and
+    trailing whitespace stripped (interior whitespace is left as is).
+
+    Args:
+        filepath: Path of the (X)HTML file to read.
+
+    Returns:
+        A list of strings, in document order.
+    """
+    with open(filepath, "r", encoding="utf-8") as handle:
+        markup = handle.read()
+    document = bs4.BeautifulSoup(markup, "html.parser")
+
+    paragraphs = (
+        para
+        for block in document.find_all("div", class_="poem")
+        for para in block.find_all("p")
+    )
+
+    lines = []
+    for para in paragraphs:
+        pending = []
+        for node in para.children:
+            if node.name != "br":
+                pending.append(node.get_text())
+                continue
+            # A <br> ends the current line, even when nothing was collected.
+            lines.append("".join(pending).strip())
+            pending = []
+        # Flush whatever follows the last <br> (or the whole <p> if it has none).
+        if pending:
+            lines.append("".join(pending).strip())
+    return lines
+
+
+# A closing note bracket 〉 immediately followed by a genre heading. Compiled
+# once because both branches of extract_sections() search for it. Only the
+# position of the match is used, so the overlap between 文 and 祭文 is harmless.
+_GENRE_AFTER_NOTE = re.compile(
+    r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)"
+)
+
+# Spellings of the 事对 heading, tried in this order (first spelling found wins).
+_SHIDUI_MARKERS = ("事對", "事对")
+
+
+def _split_before_genre(text):
+    """Split *text* right after the first 〉 that is followed by a genre heading."""
+    match = _GENRE_AFTER_NOTE.search(text)
+    if match is None:
+        return text, ""
+    cut = match.start() + 1  # 〉 stays on the left, the genre heading goes right
+    return text[:cut], text[cut:]
+
+
+def extract_sections(entry_text):
+    """Split one entry line into its 叙事 / 事对 / 诗文 parts.
+
+    Everything after the first "〈叙事〉" marker is considered. If a 事对 heading
+    ("事對", else "事对") occurs, the text before it becomes 叙事 and the text
+    after it is split again at the first 〉 followed by a genre heading: the
+    left part is 事对, the right part (starting with the genre heading) is 诗文.
+    Without a 事对 heading the same genre split is applied directly, with the
+    left part stored as 叙事.
+
+    Args:
+        entry_text: Full text of one entry, including the "〈叙事〉" marker.
+
+    Returns:
+        A dict with the keys "叙事", "事对" and "诗文". Parts that are not found
+        are empty strings; all three are empty when "〈叙事〉" is absent.
+    """
+    result = {"叙事": "", "事对": "", "诗文": ""}
+
+    # str.partition keeps embedded newlines; a `.`-based regex would stop at
+    # the first "\n" and silently drop the rest of the entry.
+    _, marker, body = entry_text.partition("〈叙事〉")
+    if not marker:
+        return result
+
+    shidui_start = -1
+    shidui_len = 0
+    for heading in _SHIDUI_MARKERS:
+        shidui_start = body.find(heading)
+        if shidui_start != -1:
+            shidui_len = len(heading)
+            break
+
+    if shidui_start == -1:
+        # No 事对 heading: the narrative runs up to the first genre heading.
+        result["叙事"], result["诗文"] = _split_before_genre(body)
+        return result
+
+    result["叙事"] = body[:shidui_start]
+    after_shidui = body[shidui_start + shidui_len:]  # skip the heading itself
+    result["事对"], result["诗文"] = _split_before_genre(after_shidui)
+    return result
+
+
+# Parse one chapter file and keep every line that carries the 〈叙事〉 marker.
+chapter_path = os.path.join(
+    html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
+)
+texts = parse_html(chapter_path)
+
+# Trailing ordinal such as "第一" or "第三上" at the end of an entry heading.
+ordinal_suffix = r"第[一二三四五六七八九十百]+(?:[上下])?$"
+
+entries = []
+for line in texts:
+    if "〈叙事〉" not in line:
+        continue
+    # The heading is whatever precedes the first 〈叙事〉 marker.
+    entry_name = line.partition("〈叙事〉")[0]
+    word_name = re.sub(ordinal_suffix, "", entry_name).strip()
+    entries.append((word_name, entry_name, extract_sections(line)))
+
+# Print a short summary of the first three entries as a sanity check.
+for word_name, entry_name, sections in entries[:3]:
+    print(f"Word: {word_name}")
+    print(f"Entry: {entry_name}")
+    print(f"叙事 len: {len(sections['叙事'])}")
+    print(f"事对 len: {len(sections['事对'])}")
+    print(f"诗文 len: {len(sections['诗文'])}")
+    print("-" * 20)