Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/佩文韵府/parser.py
+++ b/佩文韵府/parser.py
@@ -0,0 +1,124 @@
+import re
+from bs4 import BeautifulSoup
+
+def simplify(text):
+    """Convert a fixed set of traditional characters in *text* to simplified.
+
+    Only the characters listed in the table below are affected; every other
+    character is returned unchanged.
+    """
+    # Traditional characters and their simplified forms, aligned by index.
+    # 摘, 句 and 卷 map to themselves (no-op entries).
+    traditional = "東聲韻詞條對語摘句卷紅銅"
+    simplified = "东声韵词条对语摘句卷红铜"
+
+    # Every pair is one character to one character and no simplified form
+    # is also a source character, so a single translate pass gives the same
+    # result as replacing the pairs one after another.
+    table = str.maketrans(traditional, simplified)
+    converted = text.translate(table)
+
+    return converted
+
+def _volume_number(numeral):
+    """Convert a Chinese numeral made of 一–九 and 十 to an int.
+
+    Accepts the spoken form (十一 -> 11, 二十 -> 20, 二十一 -> 21) as well as
+    plain digit strings (二一 -> 21).
+    """
+    digits = "一二三四五六七八九"
+    total = 0
+    pending = 0
+    for ch in numeral:
+        if ch == "十":
+            # A bare 十 counts as one ten; otherwise it scales the digit before it.
+            total += (pending or 1) * 10
+            pending = 0
+        else:
+            pending = pending * 10 + digits.index(ch) + 1
+    return total + pending
+
+
+def parse_html(file_path):
+    """Parse one HTML page into a dict of rhyme-character entries.
+
+    The text of the first ``<div class="poem">`` is scanned for the tone and
+    rhyme header, the volume label (``卷X之Y``) and the line of rhyme
+    characters directly after the header.  The remaining text is split into
+    ``word〈description〉`` tokens, each attached to the latest rhyme character.
+
+    Args:
+        file_path: Path of a UTF-8 encoded HTML file.
+
+    Returns:
+        A dict keyed by ``（韵母）<char>``; each value has the keys ``卷``,
+        ``声``, ``韵``, ``小韵描述``, ``词条``, ``对语`` and ``摘句``.  Empty
+        when the page has no ``div.poem`` element.
+    """
+    with open(file_path, "r", encoding="utf-8") as f:
+        soup = BeautifulSoup(f.read(), "html.parser")
+
+    poem_div = soup.find("div", class_="poem")
+    if not poem_div:
+        return {}
+    text = poem_div.get_text()
+
+    # Tone and rhyme header: a tone name, whitespace, then text containing 韻.
+    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
+    current_tone = current_rhyme = ""
+    if tone_rhyme_m:
+        current_tone = simplify(tone_rhyme_m.group(1))
+        # Group 2 always contains 韻; keep only what precedes the first one.
+        raw_rhyme = tone_rhyme_m.group(2).strip()
+        current_rhyme = simplify(raw_rhyme.partition("韻")[0])
+
+    # Volume label "卷<numeral>之<part>", with the numeral rendered zero-padded.
+    current_vol = ""
+    vol_m = re.search(r"卷([一二三四五六七八九十]+)之(.+)", text)
+    if vol_m:
+        # Convert by value; a per-character digit mapping would turn 十一
+        # into "101" and 二十 into "210".
+        part = vol_m.group(2).split('\n')[0].strip()
+        current_vol = f"{_volume_number(vol_m.group(1)):02d}之{part}"
+
+    # Rhyme characters: single-character items on the line after the header.
+    lines = text.split('\n')
+    rhyme_chars = []
+    if tone_rhyme_m:
+        header = tone_rhyme_m.group(2)
+        for i, line in enumerate(lines):
+            if header in line:
+                # The header can be the last line; then there is nothing to read.
+                if i + 1 < len(lines):
+                    chars_line = lines[i + 1].replace('　', ' ')
+                    rhyme_chars = [c for c in chars_line.split() if len(c) == 1]
+                break
+
+    # Drop blank lines, running titles, the header and the rhyme-character line.
+    tone_markers = ("平聲", "上聲", "去聲", "入聲")
+    clean_lines = []
+    for line in lines:
+        stripped = line.strip()
+        if not stripped or stripped == "欽定四庫全書":
+            continue
+        if stripped.startswith("御定佩文韻府"):
+            continue
+        if "韻" in stripped and any(m in stripped for m in tone_markers):
+            continue
+        if all(c in rhyme_chars or c in ' 　' for c in stripped):
+            continue
+        clean_lines.append(stripped)
+    clean_text = "".join(clean_lines)
+
+    result = {}
+    entry = None  # entry of the rhyme character currently being filled
+    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
+    for word, desc_blocks in tokens:
+        word = word.strip()
+        desc_content = desc_blocks.replace('〈', '').replace('〉', '')
+        if word in rhyme_chars:
+            # A rhyme character opens its entry (or re-selects an existing one).
+            key = f"（韵母）{simplify(word)}"
+            if key not in result:
+                result[key] = {
+                    "卷": current_vol,
+                    "声": current_tone,
+                    "韵": current_rhyme,
+                    "小韵描述": simplify(word + desc_content),
+                    "词条": {},
+                    "对语": "",
+                    "摘句": "",
+                }
+            entry = result[key]
+        elif entry is None:
+            continue  # tokens before the first rhyme character are ignored
+        elif word in ("對語", "对语"):
+            entry["对语"] += desc_content
+        elif word == "摘句":
+            entry["摘句"] += desc_content
+        elif word:
+            # Anything else with a non-empty head word is a 词条.
+            entry["词条"][word] = desc_content
+    return result
+