Files
spider-ctext/佩文韵府/parser.py
2026-03-22 16:18:35 +08:00

125 lines
4.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
from bs4 import BeautifulSoup
def simplify(text):
    """Return *text* after applying every replacement pair in the table.

    Each key of the table is replaced by its value, one pair at a time, in
    the table's iteration order.

    NOTE(review): every key and value of the table appears as an empty
    string in this copy of the file. Judging by the function name these
    were presumably traditional -> simplified character pairs that were
    lost in extraction -- confirm against the original file before relying
    on this function. As written here the function returns *text* unchanged.
    """
    replacements = {
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
    }
    converted = text
    for source, target in replacements.items():
        converted = converted.replace(source, target)
    return converted
def parse_html(file_path):
    """Parse one saved HTML page into a dict of rhyme-character entries.

    The text of the first ``<div class="poem">`` is scanned for a tone/rhyme
    header, a volume marker (``卷…之…``), the line that follows the header
    (taken as the list of single-character rhyme headwords), and then a
    sequence of ``word〈description〉`` tokens.

    NOTE(review): several single-character string literals in this function
    (the ``replace`` arguments, the ``num_map`` keys, one header-filter
    substring, three result-dict keys and possibly the whitespace literals)
    appear empty or as plain spaces in this copy of the file. They are
    reproduced exactly as seen; confirm them against the original file.

    Args:
        file_path: Path of a UTF-8 encoded HTML file.

    Returns:
        A dict keyed by ``"(韵母)" + simplify(char)`` for each rhyme headword
        encountered. Each value is a dict holding the volume, tone and rhyme
        strings, ``"小韵描述"`` (headword plus its description), ``"词条"``
        (word -> description), ``"对语"`` and ``"摘句"`` (concatenated
        descriptions). Returns ``{}`` when the page has no ``div.poem``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}
    text = poem_div.get_text()

    # Extract tone and rhyme from the header line.
    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
    if tone_rhyme_m:
        current_tone = simplify(tone_rhyme_m.group(1))
        raw_rhyme = tone_rhyme_m.group(2).strip()
        m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme)
        # NOTE(review): the replace() argument below appears empty in this copy.
        current_rhyme = simplify(m_rhyme.group(1)) if m_rhyme else simplify(raw_rhyme.replace('', ''))
    else:
        current_tone = ""
        current_rhyme = ""

    # Extract volume: "卷<numeral>之<part>".
    vol_m = re.search(r"卷([一二三四五六七八九十]+)之(.+)", text)
    if vol_m:
        # NOTE(review): the keys of num_map appear empty in this copy;
        # presumably they were the numerals matched by the regex above.
        # Also note that numerals are mapped character by character, so a
        # multi-character numeral is concatenated rather than evaluated --
        # verify that this is intended.
        num_map = {'':'1', '':'2', '':'3', '':'4', '':'5', '':'6', '':'7', '':'8', '':'9', '':'10'}
        v1_digit = "".join(num_map.get(c, c) for c in vol_m.group(1))
        # Left-pad a single-character volume number with "0".
        v1_str = f"0{v1_digit}" if len(v1_digit) == 1 else v1_digit
        # Keep only the first line of whatever follows "之".
        v2 = vol_m.group(2).split('\n')[0].strip()
        current_vol = f"{v1_str}{v2}"
    else:
        current_vol = ""

    # Extract the rhyme headwords from the line right after the header.
    lines = text.split('\n')
    rhyme_chars = []
    if tone_rhyme_m:
        header = tone_rhyme_m.group(2)
        for i, line in enumerate(lines):
            if header in line:
                # The header can be the very last line; in that case there
                # is no headword line and rhyme_chars stays empty instead of
                # raising IndexError.
                if i + 1 < len(lines):
                    # NOTE(review): the first replace() argument may have been
                    # a non-ASCII space originally -- confirm.
                    rhyme_chars = [c for c in lines[i + 1].replace(' ', ' ').split() if len(c) == 1]
                break

    # Drop blank lines, book/title headers, the tone header and the
    # headword line; everything else is body text.
    tone_markers = ("平聲", "上聲", "去聲", "入聲")
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped == "欽定四庫全書" or stripped.startswith("御定佩文韻府"):
            continue
        # NOTE(review): the second substring appears empty in this copy.
        if any(marker in stripped for marker in tone_markers) and "" in stripped:
            continue
        if all(c in rhyme_chars or c in '  ' for c in stripped):
            continue
        clean_lines.append(stripped)

    clean_text = "".join(clean_lines)
    # Each token is (text before the brackets, one or more 〈…〉 blocks).
    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)

    result = {}
    current_char = None
    for word, desc_blocks in tokens:
        word = word.strip()
        # NOTE(review): both replace() arguments appear empty in this copy.
        desc_content = desc_blocks.replace('', '').replace('', '')

        # A rhyme headword starts a new section.
        if word in rhyme_chars:
            current_char = word
            key = f"(韵母){simplify(current_char)}"
            if key not in result:
                result[key] = {
                    # NOTE(review): these three keys appear empty in this copy.
                    "": current_vol,
                    "": current_tone,
                    "": current_rhyme,
                    "小韵描述": simplify(current_char + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": ""
                }
            continue

        # Anything seen before the first headword has no section to go in.
        if not current_char:
            continue

        entry = result[f"(韵母){simplify(current_char)}"]
        if word in ("對語", "对语"):
            # Could be multiple parts, though usually one per character block.
            entry["对语"] += desc_content
        elif word == "摘句":
            entry["摘句"] += desc_content
        elif word:
            # Any other non-empty word is a regular entry.
            entry["词条"][word] = desc_content
    return result