Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/初学记/test_parse.py
+++ b/初学记/test_parse.py
@@ -0,0 +1,35 @@
+import bs4
+import os
+import json
+import re
+
+html_dir = "epub_extracted/OPS"  # Relative directory that is joined with the XHTML file name further down.
+
+
+def parse_html(filepath):  # Parse one XHTML file; return the stripped text lines of every <p> under <div class="poem">.
+    with open(filepath, "r", encoding="utf-8") as f:
+        soup = bs4.BeautifulSoup(f.read(), "html.parser")
+
+    poem_divs = soup.find_all("div", class_="poem")
+    texts = []  # Accumulated output lines.
+    for div in poem_divs:
+        for p in div.find_all("p"):
+            # Split each <p> into lines at its direct-child <br/> tags; every other child
+            # (text node, <span>, <small>, ...) contributes its full text to the current line.
+            current_line = []  # Text fragments collected since the last <br/>.
+            for child in p.children:
+                if child.name == "br":
+                    texts.append("".join(current_line).strip())  # Appended even when empty, e.g. for consecutive <br/> tags.
+                    current_line = []
+                else:
+                    current_line.append(child.get_text())  # NOTE(review): a <br/> nested inside this child does not split the line.
+            if current_line:  # Flush the fragments after the last <br/>; the stripped result may still be "".
+                texts.append("".join(current_line).strip())
+    return texts
+
+
+texts = parse_html(  # Runs at module level: parses one fixed file under html_dir as soon as the script is executed or imported.
+    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
+)
+for i, t in enumerate(texts[:50]):  # Preview at most the first 50 extracted lines.
+    print(f"{i}: {t}")