# spider-ctext/佩文韵府/parser.py

import re
from bs4 import BeautifulSoup

def simplify(text):
    """Convert a small, fixed set of traditional characters to simplified.

    Only the characters listed in the table below are touched; every other
    character in ``text`` is returned unchanged.
    """
    # Single-pass character translation; entries mapping a character to
    # itself are kept from the original table and are no-ops.
    trad_to_simp = str.maketrans({
        '東': '东',
        '聲': '声',
        '韻': '韵',
        '詞': '词',
        '條': '条',
        '對': '对',
        '語': '语',
        '摘': '摘',
        '句': '句',
        '卷': '卷',
        '紅': '红',
        '銅': '铜',
    })
    return text.translate(trad_to_simp)

# Values of the Chinese digit characters accepted by the volume regex.
_CN_DIGITS = {
    '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
    '六': 6, '七': 7, '八': 8, '九': 9,
}


def _chinese_numeral_to_int(numeral):
    """Convert a Chinese numeral built from 一..九, 十 and 百 to an int.

    Handles multiplicative forms ('十' -> 10, '十一' -> 11, '二十' -> 20,
    '一百六' -> 106) as well as plain digit runs ('二一' -> 21).
    Raises KeyError for any character outside 一..九, 十, 百.
    """
    total = 0
    pending = 0  # digit(s) seen since the last 十/百 multiplier
    for ch in numeral:
        if ch == '十':
            total += (pending or 1) * 10
            pending = 0
        elif ch == '百':
            total += (pending or 1) * 100
            pending = 0
        else:
            pending = pending * 10 + _CN_DIGITS[ch]
    return total + pending


def parse_html(file_path):
    """Parse one saved HTML page into a dict of rhyme-character entries.

    The text of the first ``<div class="poem">`` is scanned for a tone/rhyme
    heading, a volume heading (``卷X之Y``) and the line of single rhyme
    characters that follows the rhyme heading.  The remaining text is split
    into ``word〈annotation〉`` tokens and grouped under the most recently
    seen rhyme character.

    Args:
        file_path: Path of a UTF-8 encoded HTML file.

    Returns:
        A dict keyed by ``"（韵母）" + simplified rhyme character``.  Each
        value is a dict with the keys ``"卷"`` (``"NN之Y"``), ``"声"``,
        ``"韵"``, ``"小韵描述"``, ``"词条"`` (word -> annotation text),
        ``"对语"`` and ``"摘句"`` (concatenated annotation text).
        An empty dict is returned when the page has no ``div.poem``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}

    text = poem_div.get_text()

    # Extract Tone and Rhyme
    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
    if tone_rhyme_m:
        current_tone = simplify(tone_rhyme_m.group(1))
        raw_rhyme = tone_rhyme_m.group(2).strip()
        # Keep only the part of the heading that precedes the first '韻'.
        m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme)
        if m_rhyme:
            current_rhyme = simplify(m_rhyme.group(1))
        else:
            current_rhyme = simplify(raw_rhyme.replace('韻', ''))
    else:
        current_tone = ""
        current_rhyme = ""

    # Extract Volume.  '百' is accepted so volumes >= 100 are recognised.
    vol_m = re.search(r"卷([一二三四五六七八九十百]+)之(.+)", text)
    if vol_m:
        # Convert the numeral as a whole: mapping character by character
        # would turn '十一' into '101' and '二十' into '210'.
        vol_number = _chinese_numeral_to_int(vol_m.group(1))
        # '.' never matches a newline, so the capture is already limited to
        # the rest of the heading line; only surrounding blanks need removal.
        sub_volume = vol_m.group(2).strip()
        current_vol = f"{vol_number:02d}之{sub_volume}"
    else:
        current_vol = ""

    # Extract chars: the single-character words on the line that follows the
    # first line containing the rhyme heading.
    lines = text.split('\n')
    rhyme_chars = set()
    if tone_rhyme_m:
        rhyme_heading = tone_rhyme_m.group(2)
        for i, line in enumerate(lines):
            if rhyme_heading in line:
                # Guard against the heading being the very last line.
                if i + 1 < len(lines):
                    chars_line = lines[i + 1]
                    rhyme_chars = {
                        c for c in chars_line.replace('　', ' ').split()
                        if len(c) == 1
                    }
                break

    # Drop blank lines, page headers, the tone/rhyme heading and the line
    # that only lists the rhyme characters.
    tone_markers = ("平聲", "上聲", "去聲", "入聲")
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped == "欽定四庫全書":
            continue
        if stripped.startswith("御定佩文韻府"):
            continue
        if "韻" in stripped and any(marker in stripped for marker in tone_markers):
            continue
        if all(c in rhyme_chars or c in ' 　' for c in stripped):
            continue
        clean_lines.append(stripped)

    clean_text = "".join(clean_lines)

    # Each token is (text outside brackets, one or more 〈...〉 blocks).
    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)

    result = {}
    current_entry = None  # entry of the most recently seen rhyme character

    for word, desc_blocks in tokens:
        word = word.strip()
        desc_content = desc_blocks.replace('〈', '').replace('〉', '')

        if word in rhyme_chars:
            # Main character definition: start (or return to) its entry.
            key = f"（韵母）{simplify(word)}"
            if key not in result:
                result[key] = {
                    "卷": current_vol,
                    "声": current_tone,
                    "韵": current_rhyme,
                    "小韵描述": simplify(word + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": ""
                }
            current_entry = result[key]
        elif current_entry is None:
            # Nothing to attach to before the first rhyme character.
            continue
        elif word in ("對語", "对语"):
            # Could be multiple parts, though usually one per character block
            current_entry["对语"] += desc_content
        elif word == "摘句":
            current_entry["摘句"] += desc_content
        elif word:
            # It's a 词条
            current_entry["词条"][word] = desc_content

    return result