Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/佩文韵府/test_clean.py
+++ b/佩文韵府/test_clean.py
@@ -0,0 +1,80 @@
+import re
+from bs4 import BeautifulSoup
+
+# Exploratory script: parse one volume page of 佩文韻府, print its tone/rhyme
+# header, volume number and rhyme characters, strip the repeated page headers,
+# and split the remaining text into (text, 〈...〉 annotation) tokens.
+
+with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
+    soup = BeautifulSoup(f.read(), "html.parser")
+
+poem_div = soup.find("div", class_="poem")
+if poem_div is None:
+    # find() returns None when nothing matches; exit with a clear message
+    # instead of failing with AttributeError on .get_text().
+    raise SystemExit('No <div class="poem"> element found in the input HTML')
+text = poem_div.get_text()
+
+# Extract tone and rhyme: a tone name, whitespace, then text containing "韻"
+# up to the end of that line.
+tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
+rhyme_label = None
+if tone_rhyme_m:
+    rhyme_label = tone_rhyme_m.group(2)
+    print("Tone:", tone_rhyme_m.group(1))
+    print("Rhyme:", rhyme_label)
+
+# Extract volume.
+# NOTE: each (.) matches exactly one character, so a volume number written
+# with more than one character is not recognised by this pattern.
+vol_m = re.search(r"卷(.)之(.)", text)
+if vol_m:
+    major, minor = vol_m.groups()
+    # Numerals one through nine get a leading "0".
+    pad = "0" if major in "一二三四五六七八九" else ""
+    print("Volume:", f"{pad}{major}之{minor}")
+
+# Extract rhyme characters from the line after the first line that contains
+# the rhyme label. Skipped when no tone/rhyme header was found, since there
+# is then no label to look for.
+lines = text.split("\n")
+rhyme_chars = []
+if rhyme_label is not None:
+    for i, line in enumerate(lines):
+        if rhyme_label in line:
+            # The next line usually has the characters; guard against the
+            # label sitting on the very last line.
+            if i + 1 < len(lines):
+                # split() with no separator also splits on U+3000.
+                rhyme_chars = [c for c in lines[i + 1].split() if len(c) == 1]
+            break
+
+print("Chars:", rhyme_chars)
+
+# Drop the header lines that repeat through the text: "欽定四庫全書",
+# "御定佩文韻府...", the tone/rhyme line, and the rhyme-character line.
+tone_markers = ("平聲", "上聲", "去聲", "入聲")
+# A rhyme-character line holds only rhyme characters and (ideographic) spaces.
+chars_line_alphabet = set(rhyme_chars) | {" ", "\u3000"}
+clean_lines = []
+for line in lines:
+    stripped = line.strip()
+    if not stripped:
+        continue
+    if stripped == "欽定四庫全書" or stripped.startswith("御定佩文韻府"):
+        continue
+    if "韻" in stripped and any(marker in stripped for marker in tone_markers):
+        continue
+    if all(c in chars_line_alphabet for c in stripped):
+        continue
+    clean_lines.append(stripped)
+
+clean_text = "".join(clean_lines)
+print("Start of clean text:", clean_text[:100])
+
+# Each token pairs the text before a bracket run with one or more consecutive
+# 〈...〉 groups.
+tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
+# findall() returns [] when nothing matches; avoid IndexError on tokens[0].
+print("First token:", tokens[0] if tokens else None)
+