Update: 初学记、佩文韵府 and 五车韵瑞

This commit is contained in:
denglifan
2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions

124
佩文韵府/parser.py Normal file
View File

@@ -0,0 +1,124 @@
import re
from bs4 import BeautifulSoup
def simplify(text):
    """Return *text* after applying the replacement table below.

    Each key of the table is replaced by its value, one entry after another,
    in the table's insertion order.
    """
    # NOTE(review): every key and value in this table is an empty string in
    # this copy of the file. Presumably they were single-character literals
    # (traditional -> simplified forms, judging by the function name) that
    # were lost in transit -- restore them from the original file.
    replacements = {
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
    }
    converted = text
    for old, new in replacements.items():
        converted = converted.replace(old, new)
    return converted
def parse_html(file_path):
    """Parse one HTML page and return its entries grouped by head character.

    The text of the first ``<div class="poem">`` is read; the tone / rhyme
    header and the volume marker are extracted from it; boilerplate lines are
    dropped; the remaining text is split into ``word〈description〉`` tokens.

    Args:
        file_path: Path of a UTF-8 encoded HTML file.

    Returns:
        A dict keyed by ``"(韵母)" + simplify(head character)``. Each value is
        a dict carrying the volume / tone / rhyme metadata, "小韵描述" (the
        simplified head character plus its description), "词条" (a mapping of
        word -> description), and "对语" / "摘句" (concatenated descriptions).
        An empty dict is returned when the page has no ``div.poem`` element.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}
    text = poem_div.get_text()
    lines = text.split('\n')

    # --- Tone, rhyme and head characters --------------------------------
    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
    current_tone = ""
    current_rhyme = ""
    rhyme_chars = []
    if tone_rhyme_m:
        current_tone = simplify(tone_rhyme_m.group(1))
        rhyme_header = tone_rhyme_m.group(2)
        raw_rhyme = rhyme_header.strip()
        m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme)
        if m_rhyme:
            current_rhyme = simplify(m_rhyme.group(1))
        else:
            # NOTE(review): the first replace() argument is empty in this
            # copy of the file; presumably a single character was lost.
            current_rhyme = simplify(raw_rhyme.replace('', ''))

        # Single-character tokens on the line following the header are taken
        # as the head characters of this page.
        for i, line in enumerate(lines):
            if rhyme_header in line:
                # The header can be the last line of the text; indexing
                # lines[i + 1] unconditionally raised IndexError in that case.
                if i + 1 < len(lines):
                    chars_line = lines[i + 1]
                    rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
                break

    # --- Volume ----------------------------------------------------------
    vol_m = re.search(r"卷([一二三四五六七八九十]+)之(.+)", text)
    current_vol = ""
    if vol_m:
        # NOTE(review): all keys of num_map are empty in this copy of the
        # file; presumably they were the numerals matched by the regex above.
        # If so, the per-character substitution below would render a
        # multi-character numeral such as "十一" as "101" -- confirm whether
        # such volume numbers occur in the input.
        num_map = {'':'1', '':'2', '':'3', '':'4', '':'5', '':'6', '':'7', '':'8', '':'9', '':'10'}
        v1_digit = "".join(num_map.get(c, c) for c in vol_m.group(1))
        # Pad a single character to two so volume strings have uniform width.
        v1_str = f"0{v1_digit}" if len(v1_digit) == 1 else v1_digit
        # Keep only the first line of whatever followed '之'.
        v2 = vol_m.group(2).split('\n')[0].strip()
        current_vol = f"{v1_str}{v2}"

    # --- Drop boilerplate lines -----------------------------------------
    tone_markers = ("平聲", "上聲", "去聲", "入聲")
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped == "欽定四庫全書" or stripped.startswith("御定佩文韻府"):
            continue
        # NOTE(review): the second operand is an empty literal in this copy of
        # the file, which makes the containment test always true; presumably
        # it named a character that marks the tone/rhyme header line.
        if any(marker in stripped for marker in tone_markers) and "" in stripped:
            continue
        # Skip the line that only lists the head characters.
        if all(c in rhyme_chars or c in '  ' for c in stripped):
            continue
        clean_lines.append(stripped)
    clean_text = "".join(clean_lines)

    # --- Tokenise into (word, one-or-more 〈...〉 blocks) ------------------
    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
    result = {}
    entry = None  # entry of the most recently seen head character
    for word, desc_blocks in tokens:
        word = word.strip()
        # NOTE(review): both replace() targets are empty in this copy of the
        # file; presumably they were the '〈' / '〉' delimiters used by the
        # token regex above.
        desc_content = desc_blocks.replace('', '').replace('', '')

        if word in rhyme_chars:
            # A head character starts (or resumes) an entry.
            key = f"(韵母){simplify(word)}"
            if key not in result:
                # NOTE(review): the first three keys are empty literals in
                # this copy of the file, so only the last one survives at
                # runtime; presumably they were three distinct one-character
                # keys for volume, tone and rhyme.
                result[key] = {
                    "": current_vol,
                    "": current_tone,
                    "": current_rhyme,
                    "小韵描述": simplify(word + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": ""
                }
            entry = result[key]
        elif entry is None:
            # Nothing to attach to before the first head character is seen.
            continue
        elif word in ("對語", "对语"):
            # Could be multiple parts, though usually one per character block.
            entry["对语"] += desc_content
        elif word == "摘句":
            entry["摘句"] += desc_content
        elif word:
            # Any other non-empty word is an ordinary entry of this character.
            entry["词条"][word] = desc_content
    return result