Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
21
佩文韵府/test_tokenize.py
Normal file
21
佩文韵府/test_tokenize.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Quick tokenization experiment for a 佩文韵府 (Peiwen Yunfu) HTML volume.

Reads one converted HTML page, extracts the poem text, and splits it into
(word, descriptions) pairs, where each word is followed by one or more
〈...〉 description blocks.  Prints the first 20 pairs for inspection.
"""
import re

from bs4 import BeautifulSoup

with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# The body text lives in a single <div class="poem">.  Join it onto one
# line so the regex below is not broken by line wraps.
poem_div = soup.find("div", class_="poem")
if poem_div is None:
    # Fail with context instead of an opaque AttributeError on None.
    raise ValueError("no <div class='poem'> found in html_files/卷001之1.html")
text = poem_div.get_text().replace('\n', '')

# Remove header junk for this test (just find the first '東〈').
start_idx = text.find('東〈')
if start_idx == -1:
    # str.find returns -1 when absent; text[-1:] would previously have
    # silently reduced the text to its final character.
    raise ValueError("marker '東〈' not found; page layout may differ")
text = text[start_idx:]

# Tokenize into pairs of (Word, Description).
# A word may be followed by MULTIPLE 〈...〉 blocks (e.g. 對語〈...〉〈...〉),
# so match a run of non-bracket characters followed by one or more
# bracketed description blocks.
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text)

for i, (word, desc_blocks) in enumerate(tokens[:20]):
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks[:30]}...")
|
||||
|
||||
Reference in New Issue
Block a user