125 lines
4.2 KiB
Python
125 lines
4.2 KiB
Python
import re
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Traditional -> Simplified table for the handful of characters this module
# emits in its output keys/values. Built once at import time so each call is
# a single C-level pass instead of one ``str.replace`` per mapping entry.
# Characters that are written the same in both scripts (e.g. 摘, 句, 卷) need
# no entry: ``str.translate`` leaves unmapped characters untouched.
_SIMPLIFY_TABLE = str.maketrans({
    '東': '东',
    '聲': '声',
    '韻': '韵',
    '詞': '词',
    '條': '条',
    '對': '对',
    '語': '语',
    '紅': '红',
    '銅': '铜',
})


def simplify(text: str) -> str:
    """Return *text* with the known Traditional characters simplified.

    Only the characters listed in ``_SIMPLIFY_TABLE`` are converted; every
    other character (including ASCII and unmapped CJK) is returned unchanged.

    Args:
        text: The string to convert. May be empty.

    Returns:
        A new string of the same length with mapped characters replaced.
    """
    return text.translate(_SIMPLIFY_TABLE)
|
||
|
||
# Patterns are compiled once at import time; the pattern text itself is
# unchanged from the inline versions they replace.
_TONE_RHYME_RE = re.compile(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)")
_VOLUME_RE = re.compile(r"卷([一二三四五六七八九十]+)之(.+)")
_ENTRY_RE = re.compile(r"([^〈〉]*)((?:〈[^〉]+〉)+)")

_CN_DIGITS = {
    '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
    '六': 6, '七': 7, '八': 8, '九': 9,
}
_TONE_MARKERS = ("平聲", "上聲", "去聲", "入聲")


def _volume_number(numeral):
    """Convert a numeral made of 一..十 into a decimal string, zero-padded to 2.

    Numerals containing 十 are read compositionally (十一 -> "11",
    二十 -> "20", 二十三 -> "23"). Numerals without 十 are read digit by
    digit (一二 -> "12"), which is what the per-character mapping always did.
    """
    if '十' not in numeral:
        return "".join(str(_CN_DIGITS[ch]) for ch in numeral).zfill(2)

    total = 0
    pending = 0  # digit seen since the last 十, waiting to be placed
    for ch in numeral:
        if ch == '十':
            # A bare 十 (no leading digit) means one ten.
            total += (pending or 1) * 10
            pending = 0
        else:
            pending = _CN_DIGITS[ch]
    return f"{total + pending:02d}"


def _extract_volume(text):
    """Return the volume label as '<NN>之<part>', or '' when none is found."""
    vol_m = _VOLUME_RE.search(text)
    if not vol_m:
        return ""
    # '.' never matches a newline, so group(2) is already limited to one line;
    # strip() removes surrounding whitespace such as a trailing '\r'.
    part = vol_m.group(2).strip()
    return f"{_volume_number(vol_m.group(1))}之{part}"


def _extract_rhyme_chars(lines, rhyme_heading):
    """Return the single-character tokens on the line after the rhyme heading.

    The heading line is the first line containing *rhyme_heading*. An empty
    set is returned when no such line exists or when it is the last line.
    """
    for idx, line in enumerate(lines):
        if rhyme_heading in line:
            if idx + 1 < len(lines):
                # split() with no argument separates on any Unicode
                # whitespace, so no prior space normalisation is needed.
                return {tok for tok in lines[idx + 1].split() if len(tok) == 1}
            break
    return set()


def _is_boilerplate(stripped, rhyme_chars):
    """Tell whether a stripped, non-empty line is header material to drop."""
    if stripped == "欽定四庫全書" or stripped.startswith("御定佩文韻府"):
        return True
    if "韻" in stripped and any(marker in stripped for marker in _TONE_MARKERS):
        return True
    # The row listing the rhyme characters themselves. Any whitespace is
    # accepted as a separator, matching how the row was tokenised.
    # NOTE(review): this also drops a body line that consists solely of rhyme
    # characters -- confirm that is intended.
    return all(ch in rhyme_chars or ch.isspace() for ch in stripped)


def parse_html(file_path) -> dict:
    """Parse one HTML page into a dict keyed by rhyme character.

    The text of the first ``<div class="poem">`` is read, header lines are
    removed, and the remainder is split into ``word〈description〉`` entries.
    An entry whose word is one of the rhyme characters (the single-character
    tokens on the line following the tone/rhyme heading) opens a new record;
    following entries are attached to the most recently opened record.

    Args:
        file_path: Path to a UTF-8 encoded HTML file.

    Returns:
        ``{}`` when the page has no ``div.poem``. Otherwise a dict mapping
        ``"(韵母)<char>"`` to a record with the keys ``卷``, ``声``, ``韵``,
        ``小韵描述``, ``词条`` (a dict of word -> description), ``对语`` and
        ``摘句``. Entries that appear before any rhyme character are ignored.
        If a rhyme character is defined more than once, the first description
        is kept.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}

    text = poem_div.get_text()
    lines = text.split('\n')

    # Tone, rhyme name and rhyme characters all hang off the same heading.
    tone_rhyme_m = _TONE_RHYME_RE.search(text)
    if tone_rhyme_m:
        current_tone = simplify(tone_rhyme_m.group(1))
        # Keep only what precedes the first 韻 in the heading.
        rhyme_name = tone_rhyme_m.group(2).strip().partition('韻')[0]
        current_rhyme = simplify(rhyme_name)
        rhyme_chars = _extract_rhyme_chars(lines, tone_rhyme_m.group(2))
    else:
        current_tone = ""
        current_rhyme = ""
        rhyme_chars = set()

    current_vol = _extract_volume(text)

    # Join the body lines without separators so that entries broken across
    # lines are matched as one piece.
    stripped_lines = (line.strip() for line in lines)
    clean_text = "".join(
        stripped
        for stripped in stripped_lines
        if stripped and not _is_boilerplate(stripped, rhyme_chars)
    )

    result = {}
    current_key = None  # key of the record that subsequent entries attach to

    for word, desc_blocks in _ENTRY_RE.findall(clean_text):
        word = word.strip()
        desc_content = desc_blocks.replace('〈', '').replace('〉', '')

        if word in rhyme_chars:
            # A head character: open its record (first definition wins).
            current_key = f"(韵母){simplify(word)}"
            if current_key not in result:
                result[current_key] = {
                    "卷": current_vol,
                    "声": current_tone,
                    "韵": current_rhyme,
                    "小韵描述": simplify(word + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": "",
                }
            continue

        if current_key is None:
            # Nothing to attach to yet.
            continue

        record = result[current_key]
        if word in ("對語", "对语"):
            # Could be multiple parts, though usually one per character block.
            record["对语"] += desc_content
        elif word == "摘句":
            record["摘句"] += desc_content
        elif word:
            record["词条"][word] = desc_content

    return result
|
||
|