Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
124
佩文韵府/parser.py
Normal file
124
佩文韵府/parser.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Traditional -> simplified substitutions for the handful of characters that
# appear in the tone names, rhyme names, section labels and head characters
# this parser emits.  Built once at import time so each call is a single
# C-level pass over the text instead of one full-string replace per entry.
_SIMPLIFY_TABLE = str.maketrans({
    '東': '东',
    '聲': '声',
    '韻': '韵',
    '詞': '词',
    '條': '条',
    '對': '对',
    '語': '语',
    '紅': '红',
    '銅': '铜',
})


def simplify(text):
    """Return *text* with the known traditional characters simplified.

    Only the characters listed in ``_SIMPLIFY_TABLE`` are converted; every
    other character is passed through unchanged.  This is a small fixed
    substitution table, not a general traditional-to-simplified converter.

    Args:
        text: The string to convert.

    Returns:
        A new string of the same length with the mapped characters replaced.
    """
    return text.translate(_SIMPLIFY_TABLE)
|
||||
|
||||
# Tone name followed by the rhyme heading (the rest of that line).
_TONE_RHYME_RE = re.compile(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)")

# Volume heading "卷<numeral>之<sub-volume>".  '百' is accepted so that headings
# for volumes 100 and above are not silently dropped.
_VOLUME_RE = re.compile(r"卷([一二三四五六七八九十百]+)之(.+)")

# A headword (possibly empty) followed by one or more 〈...〉 annotation blocks.
_TOKEN_RE = re.compile(r"([^〈〉]*)((?:〈[^〉]+〉)+)")

_CN_DIGITS = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
              '六': 6, '七': 7, '八': 8, '九': 9}
_CN_UNITS = {'十': 10, '百': 100}

# Substrings that, together with '韻', mark a tone/rhyme heading line.
_TONE_MARKERS = ("平聲", "上聲", "去聲", "入聲")


def _format_volume_number(numeral):
    """Convert a Chinese volume numeral to a decimal string of width >= 2.

    *numeral* must consist only of 一..九, 十 and 百.  Numerals containing a
    unit character are read additively (十一 -> 11, 二十 -> 20, 一百六 -> 106);
    numerals made of digits only are read positionally (一二 -> 12).
    """
    if any(ch in _CN_UNITS for ch in numeral):
        total = 0
        pending = 0  # digit waiting for its unit, or the trailing ones digit
        for ch in numeral:
            if ch in _CN_DIGITS:
                pending = _CN_DIGITS[ch]
            else:
                # A bare unit (as in 十 or 十一) has an implicit multiplier 1.
                total += (pending or 1) * _CN_UNITS[ch]
                pending = 0
        digits = str(total + pending)
    else:
        digits = "".join(str(_CN_DIGITS[ch]) for ch in numeral)
    return digits.zfill(2)


def _parse_tone_and_rhyme(text):
    """Return ``(tone, rhyme, heading)`` found in *text*.

    *tone* and *rhyme* are simplified strings; *heading* is the raw rhyme
    heading as it appears in the text, used later to locate the line that
    lists the rhyme characters.  When no heading is found the result is
    ``("", "", None)``.
    """
    match = _TONE_RHYME_RE.search(text)
    if not match:
        return "", "", None
    heading = match.group(2)
    # The heading always contains '韻' (the pattern requires it); the rhyme
    # name is everything in front of the first one.
    rhyme = heading.strip().partition('韻')[0]
    return simplify(match.group(1)), simplify(rhyme), heading


def _parse_volume(text):
    """Return the volume label, e.g. ``"01之一"``, or ``""`` if absent."""
    match = _VOLUME_RE.search(text)
    if not match:
        return ""
    # group(2) is the remainder of the heading line; trim stray whitespace.
    sub_volume = match.group(2).strip()
    return f"{_format_volume_number(match.group(1))}之{sub_volume}"


def _find_rhyme_chars(lines, heading):
    """Return the single-character tokens on the line after the heading."""
    if heading is None:
        return []
    for index, line in enumerate(lines):
        if heading in line:
            # The heading may be the very last line of the text.
            if index + 1 >= len(lines):
                return []
            # split() with no argument splits on any Unicode whitespace.
            return [tok for tok in lines[index + 1].split() if len(tok) == 1]
    return []


def _content_lines(lines, rhyme_chars):
    """Return the stripped lines that carry entry text.

    Blank lines, the collection/title banners, tone/rhyme heading lines and
    lines made up solely of rhyme characters and whitespace are dropped.
    """
    kept = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped == "欽定四庫全書" or stripped.startswith("御定佩文韻府"):
            continue
        if "韻" in stripped and any(m in stripped for m in _TONE_MARKERS):
            continue
        # Any whitespace counts as a separator here, matching how the
        # rhyme-character line itself is tokenised.
        if all(ch in rhyme_chars or ch.isspace() for ch in stripped):
            continue
        kept.append(stripped)
    return kept


def _parse_poem_text(text):
    """Build the result dictionary from the plain text of the poem block."""
    tone, rhyme, heading = _parse_tone_and_rhyme(text)
    volume = _parse_volume(text)

    lines = text.split('\n')
    rhyme_chars = frozenset(_find_rhyme_chars(lines, heading))
    clean_text = "".join(_content_lines(lines, rhyme_chars))

    result = {}
    current_key = None  # key of the head character currently being filled
    for word, desc_blocks in _TOKEN_RE.findall(clean_text):
        word = word.strip()
        desc_content = desc_blocks.replace('〈', '').replace('〉', '')

        if word in rhyme_chars:
            # A head character starts a new section; its annotation is the
            # description of that character.
            current_key = f"(韵母){simplify(word)}"
            if current_key not in result:
                result[current_key] = {
                    "卷": volume,
                    "声": tone,
                    "韵": rhyme,
                    "小韵描述": simplify(word + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": "",
                }
        elif current_key is None:
            # Text before the first head character belongs to no entry.
            continue
        elif word in ("對語", "对语"):
            result[current_key]["对语"] += desc_content
        elif word == "摘句":
            result[current_key]["摘句"] += desc_content
        elif word:
            result[current_key]["词条"][word] = desc_content

    return result


def parse_html(file_path):
    """Parse one HTML page into a dictionary keyed by head character.

    The text of the first ``<div class="poem">`` is scanned for the tone and
    rhyme heading, the volume heading, the list of rhyme (head) characters on
    the line following the rhyme heading, and then a sequence of
    ``headword〈annotation〉`` tokens.

    Args:
        file_path: Path of a UTF-8 encoded HTML file.

    Returns:
        A dict mapping ``"(韵母)<char>"`` to a dict with the keys ``"卷"``,
        ``"声"``, ``"韵"``, ``"小韵描述"``, ``"词条"`` (headword -> annotation),
        ``"对语"`` and ``"摘句"``.  An empty dict is returned when the page has
        no ``div.poem`` element.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}

    return _parse_poem_text(poem_div.get_text())
|
||||
|
||||
Reference in New Issue
Block a user