import re

from bs4 import BeautifulSoup

# Traditional -> simplified substitutions applied to emitted values.
# Every key is a single character, so one translate() pass is equivalent to
# chaining str.replace() calls.
_SIMPLIFY_TABLE = str.maketrans({
    '東': '东',
    '聲': '声',
    '韻': '韵',
    '詞': '词',
    '條': '条',
    '對': '对',
    '語': '语',
    '紅': '红',
    '銅': '铜',
})

# Header such as "上平聲 一東韻": group 1 is the tone, group 2 the rest of the
# line that contains the rhyme name.
_TONE_RHYME_RE = re.compile(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)")
# Volume marker such as "卷一之一": group 1 is the numeral, group 2 the part.
_VOLUME_RE = re.compile(r"卷([一二三四五六七八九十]+)之(.+)")
# A headword (possibly empty) followed by one or more 〈...〉 annotation blocks.
_TOKEN_RE = re.compile(r"([^〈〉]*)((?:〈[^〉]+〉)+)")

_CN_DIGITS = {
    '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
    '六': 6, '七': 7, '八': 8, '九': 9,
}
_TONE_MARKERS = ("平聲", "上聲", "去聲", "入聲")
# Separators tolerated inside a rhyme-character list line
# (ASCII space and ideographic space).
_INLINE_SPACES = " \u3000"


def simplify(text):
    """Return *text* with the known traditional characters simplified.

    Only the characters listed in ``_SIMPLIFY_TABLE`` are converted; all
    other characters are returned unchanged.
    """
    return text.translate(_SIMPLIFY_TABLE)


def _chinese_to_int(numeral):
    """Convert a numeral written with 一–九 and 十 into an int.

    十 acts as a tens multiplier (十一 -> 11, 二十三 -> 23). Mapping each
    character independently and concatenating would turn 十一 into "101".
    """
    total = 0
    pending = 0
    for ch in numeral:
        if ch == '十':
            # A bare 十 (no leading digit) means one ten.
            total += (pending or 1) * 10
            pending = 0
        else:
            pending = pending * 10 + _CN_DIGITS[ch]
    return total + pending


def _extract_volume(text):
    """Return the volume label as "NN之<part>", or "" if no marker is found."""
    vol_m = _VOLUME_RE.search(text)
    if not vol_m:
        return ""
    # '.' never matches a newline, so group(2) is already limited to the
    # remainder of the marker's own line; only surrounding whitespace is cut.
    part = vol_m.group(2).strip()
    return f"{_chinese_to_int(vol_m.group(1)):02d}之{part}"


def _extract_rhyme_chars(lines, header_m):
    """Return the single-character tokens on the line after the header.

    Returns an empty set when there is no header match or the header is the
    last line of the text.
    """
    if not header_m:
        return frozenset()
    header_text = header_m.group(2)
    for idx, line in enumerate(lines):
        if header_text in line:
            if idx + 1 >= len(lines):
                # Header is the final line: nothing follows it.
                return frozenset()
            # str.split() with no argument splits on any Unicode whitespace,
            # including the ideographic space.
            return frozenset(
                tok for tok in lines[idx + 1].split() if len(tok) == 1
            )
    return frozenset()


def _is_boilerplate(line, rhyme_chars):
    """Tell whether a stripped, non-empty line is a header or rhyme list."""
    if line == "欽定四庫全書" or line.startswith("御定佩文韻府"):
        return True
    if "韻" in line and any(marker in line for marker in _TONE_MARKERS):
        return True
    # A line made up solely of rhyme characters and separators is the
    # rhyme-character list itself, not dictionary content.
    return all(c in rhyme_chars or c in _INLINE_SPACES for c in line)


def _parse_poem_text(text):
    """Build the result dictionary from the plain text of the poem div."""
    header_m = _TONE_RHYME_RE.search(text)
    if header_m:
        current_tone = simplify(header_m.group(1))
        # group(2) always contains 韻, so the rhyme name is whatever
        # precedes its first occurrence.
        current_rhyme = simplify(header_m.group(2).strip().partition('韻')[0])
    else:
        current_tone = ""
        current_rhyme = ""

    current_vol = _extract_volume(text)

    lines = text.split('\n')
    rhyme_chars = _extract_rhyme_chars(lines, header_m)

    stripped_lines = (line.strip() for line in lines)
    clean_text = "".join(
        line for line in stripped_lines
        if line and not _is_boilerplate(line, rhyme_chars)
    )

    result = {}
    current_key = None
    for word, desc_blocks in _TOKEN_RE.findall(clean_text):
        word = word.strip()
        desc_content = desc_blocks.replace('〈', '').replace('〉', '')

        if word in rhyme_chars:
            # A rhyme character opens (or re-selects) an entry; an existing
            # entry is kept as is.
            current_key = f"(韵母){simplify(word)}"
            if current_key not in result:
                result[current_key] = {
                    "卷": current_vol,
                    "声": current_tone,
                    "韵": current_rhyme,
                    "小韵描述": simplify(word + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": "",
                }
        elif current_key is None:
            # Content before the first rhyme character has no owner.
            continue
        elif word in ("對語", "对语"):
            result[current_key]["对语"] += desc_content
        elif word == "摘句":
            result[current_key]["摘句"] += desc_content
        elif word:
            # Any other non-empty headword is an ordinary entry.
            result[current_key]["词条"][word] = desc_content

    return result


def parse_html(file_path):
    """Parse one HTML page into a dictionary keyed by rhyme character.

    The file is read as UTF-8 and the text of the first ``<div class="poem">``
    is analysed.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        A dict mapping ``"(韵母)<char>"`` to a dict with the keys ``"卷"``,
        ``"声"``, ``"韵"``, ``"小韵描述"``, ``"词条"`` (dict of headword to
        annotation text), ``"对语"`` and ``"摘句"``. An empty dict is returned
        when the page has no ``div.poem`` element.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    poem_div = soup.find("div", class_="poem")
    if poem_div is None:
        return {}

    return _parse_poem_text(poem_div.get_text())