Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/佩文韵府/test_split.py
+++ b/佩文韵府/test_split.py
@@ -0,0 +1,18 @@
+import re
+from bs4 import BeautifulSoup
+
+with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:  # one hard-coded sample page
+    soup = BeautifulSoup(f.read(), "html.parser")
+
+poem_div = soup.find("div", class_="poem")  # first <div class="poem">; find() gives None if absent
+text = poem_div.get_text()  # NOTE(review): raises AttributeError when poem_div is None
+
+# Locate a tone name, then a rhyme heading (any text containing 韻, e.g. "一東韻一"),
+# then the single line right after the heading, which is taken to be the character list.
+m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n", text)
+if m:  # nothing is printed when the pattern is not found
+    print("Tone:", m.group(1))
+    print("Rhyme:", m.group(2))
+    print("Chars line:", m.group(3))
+    rhyme_chars = [c for c in m.group(3).replace('　', ' ').split() if len(c) == 1]  # keep 1-char tokens only
+    print("Chars:", rhyme_chars)