Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/初学记/test_extract3.py
+++ b/初学记/test_extract3.py
@@ -0,0 +1,127 @@
+import bs4
+import os
+import re
+
+# Directory holding the XHTML files to parse. The path is relative, so it is
+# resolved against the current working directory when the script runs
+# (presumably the unpacked EPUB content folder -- TODO confirm).
+html_dir = "epub_extracted/OPS"
+
+
+def parse_html(filepath):
+    with open(filepath, "r", encoding="utf-8") as f:
+        soup = bs4.BeautifulSoup(f.read(), "html.parser")
+
+    poem_divs = soup.find_all("div", class_="poem")
+
+    events = []
+    for div in poem_divs:
+        for p in div.find_all("p"):
+            current_line = []
+            for child in p.children:
+                if child.name == "br":
+                    if current_line:
+                        events.append(("line", "".join(current_line).strip()))
+                        current_line = []
+                elif (
+                    child.name == "span"
+                    and child.get("id")
+                    and child.text.strip().endswith("部")
+                ):
+                    events.append(("section", child.text.strip()))
+                elif child.name == "small":
+                    small_text = child.get_text()
+                    if not small_text.startswith("〈") and not small_text.startswith(
+                        "（"
+                    ):
+                        current_line.append(f"〈{small_text}〉")
+                    else:
+                        current_line.append(small_text)
+                else:
+                    current_line.append(child.get_text())
+            if current_line:
+                events.append(("line", "".join(current_line).strip()))
+    return events
+
+
+def extract_sections(entry_text):
+    result = {"叙事": "", "事对": "", "诗文": ""}
+
+    narrative_match = re.search(r"〈(叙事|敘事)〉(.*)", entry_text)
+    if not narrative_match:
+        return result
+
+    rest_after_narrative = narrative_match.group(2)
+
+    shìduì_start = rest_after_narrative.find("事對")
+    if shìduì_start == -1:
+        shìduì_start = rest_after_narrative.find("事对")
+
+    if shìduì_start != -1:
+        result["叙事"] = rest_after_narrative[:shìduì_start]
+        rest_after_shiduì = rest_after_narrative[shìduì_start + 2 :]
+
+        genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"
+        shiwen_match = re.search(genre_pattern, rest_after_shiduì)
+
+        if shiwen_match:
+            split_idx = shiwen_match.start() + 1
+            result["事对"] = rest_after_shiduì[:split_idx]
+            result["诗文"] = rest_after_shiduì[split_idx:]
+        else:
+            result["事对"] = rest_after_shiduì
+    else:
+        genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"
+        shiwen_match = re.search(genre_pattern, rest_after_narrative)
+        if shiwen_match:
+            split_idx = shiwen_match.start() + 1
+            result["叙事"] = rest_after_narrative[:split_idx]
+            result["诗文"] = rest_after_narrative[split_idx:]
+        else:
+            result["叙事"] = rest_after_narrative
+
+    for k in result:
+        result[k] = result[k].replace("〈", "（").replace("〉", "）")
+
+    return result
+
+
+events = parse_html(
+    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
+)
+
+entries = []
+current_entry_text = []
+
+for ev_type, text in events:
+    if ev_type == "section":
+        pass
+    else:
+        if "〈叙事〉" in text or "〈敘事〉" in text:
+            if current_entry_text:
+                full_text = "".join(current_entry_text)
+                match = re.search(
+                    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉",
+                    full_text,
+                )
+                if match:
+                    entry_name = match.group(1).strip()
+                    sections = extract_sections(full_text)
+                    entries.append((entry_name, sections))
+            current_entry_text = [text]
+        else:
+            current_entry_text.append(text)
+
+if current_entry_text:
+    full_text = "".join(current_entry_text)
+    match = re.search(
+        r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", full_text
+    )
+    if match:
+        entry_name = match.group(1).strip()
+        sections = extract_sections(full_text)
+        entries.append((entry_name, sections))
+
+for entry_name, sections in entries[:3]:
+    print("Entry:", entry_name)
+    print("叙事:", sections["叙事"][:50])
+    print("事对:", sections["事对"][:50])
+    print("诗文:", sections["诗文"][:50])
+    print("-" * 20)