Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/初学记/test_extract2.py
+++ b/初学记/test_extract2.py
@@ -0,0 +1,54 @@
+import bs4
+import re
+import os
+
+# Directory the chapter XHTML file is read from; it is joined with a file
+# name at the bottom of this script.
+# NOTE(review): presumably the OPS folder of an unpacked EPUB — confirm.
+html_dir = "epub_extracted/OPS"
+
+
+def parse_html(filepath):
+    with open(filepath, "r", encoding="utf-8") as f:
+        soup = bs4.BeautifulSoup(f.read(), "html.parser")
+
+    poem_divs = soup.find_all("div", class_="poem")
+
+    events = []
+
+    for div in poem_divs:
+        for p in div.find_all("p"):
+            current_line = []
+            for child in p.children:
+                if child.name == "br":
+                    if current_line:
+                        events.append(("line", "".join(current_line).strip()))
+                        current_line = []
+                elif (
+                    child.name == "span"
+                    and child.get("id")
+                    and child.text.strip().endswith("部")
+                ):
+                    events.append(("section", child.text.strip()))
+                elif child.name == "small":
+                    # Convert small text to brackets
+                    small_text = child.get_text()
+                    # It might already have brackets inside.
+                    if not small_text.startswith("〈") and not small_text.startswith(
+                        "（"
+                    ):
+                        current_line.append(f"（{small_text}）")
+                    else:
+                        # Sometimes it has pseudo-brackets like `<span style="color:transparent;font-size:0px">〈</span>叙事...`
+                        # Let's clean the text up. We should just take the clean text and if it doesn't have brackets naturally (or we just process it to be clean).
+                        # Let's look at child.get_text()
+                        current_line.append(small_text)
+                else:
+                    current_line.append(child.get_text())
+            if current_line:
+                events.append(("line", "".join(current_line).strip()))
+    return events
+
+
+events = parse_html(
+    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
+)
+for e in events[:30]:
+    print(e)