Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/初学记/test_extract.py
+++ b/初学记/test_extract.py
@@ -0,0 +1,24 @@
+import bs4
+import re
+import sys
+
+def parse_html(filepath):
+    """Print the name of every section found in the XHTML file at *filepath*.
+
+    A section is a ``<span>`` with an ``id``, sitting directly inside a ``<p>``
+    of a ``<div class="poem">``, whose stripped text ends with "部".
+    """
+    with open(filepath, "r", encoding="utf-8") as f:
+        soup = bs4.BeautifulSoup(f.read(), "html.parser")
+
+    for div in soup.find_all("div", class_="poem"):
+        for p in div.find_all("p"):
+            for child in p.children:
+                # Only element nodes can be sections; skip text nodes and <br>.
+                if not isinstance(child, bs4.element.Tag) or child.name != "span":
+                    continue
+                section = child.text.strip()
+                if child.get("id") and section.endswith("部"):
+                    print("Found Section:", section)
+
+parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")  # runs at import time; path is relative to the cwd