Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
53
佩文韵府/test_clean.py
Normal file
53
佩文韵府/test_clean.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Parse the saved HTML page; the file handle is only needed while its
# contents are read and handed to the parser.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# find() returns None when no matching tag exists. Fail with a message that
# names the missing element instead of an AttributeError on None.
poem_div = soup.find("div", class_="poem")
if poem_div is None:
    raise ValueError('no <div class="poem"> element found in the HTML page')
text = poem_div.get_text()
|
||||
|
||||
# Tone and rhyme header: group(1) is one of the five tone labels, group(2)
# is the text after the following whitespace, up to the end of the line that
# contains "韻".
tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
if tone_rhyme_m is not None:
    for label, group_no in (("Tone:", 1), ("Rhyme:", 2)):
        print(label, tone_rhyme_m.group(group_no))
|
||||
|
||||
# Volume label of the form "卷X之Y", where X and Y are single characters.
# When X is one of 一..九 the printed label gets a leading "0".
vol_m = re.search(r"卷(.)之(.)", text)
if vol_m:
    major, minor = vol_m.groups()
    prefix = "0" if major in '一二三四五六七八九' else ""
    print("Volume:", f"{prefix}{major}之{minor}")
|
||||
|
||||
# Extract chars
# The rhyme characters are taken from the line that follows the first line
# containing the rhyme header text (the next line usually has them).
lines = text.split('\n')
rhyme_chars = []

# If the tone/rhyme search found nothing there is no header to look for;
# leave rhyme_chars empty instead of calling .group() on None.
rhyme_header = tone_rhyme_m.group(2) if tone_rhyme_m else None
if rhyme_header is not None:
    # Pairing each line with its successor means a header on the very last
    # line simply has no partner, rather than indexing past the end.
    for line, next_line in zip(lines, lines[1:]):
        if rhyme_header in line:
            # split() with no argument separates on any Unicode whitespace,
            # so no prior normalisation of space characters is needed.
            rhyme_chars = [c for c in next_line.split() if len(c) == 1]
            break

print("Chars:", rhyme_chars)
|
||||
|
||||
# Strip the lines that are headers. The headers repeat through the text:
# "欽定四庫全書", "御定佩文韻府...", and the tone/rhyme lines ("上平聲..." etc.),
# so they are filtered out by matching those known forms.
tone_markers = ("平聲", "上聲", "去聲", "入聲")
# Built once so the per-character membership test below is O(1).
rhyme_char_set = set(rhyme_chars)

clean_lines = []
for line in lines:
    stripped = line.strip()
    if not stripped:
        continue
    if stripped == "欽定四庫全書":
        continue
    if stripped.startswith("御定佩文韻府"):
        continue
    # A tone/rhyme header mentions a tone label and "韻" on the same line.
    if "韻" in stripped and any(marker in stripped for marker in tone_markers):
        continue
    # The rhyme-character line consists only of the extracted characters and
    # whitespace separators; any whitespace is accepted, matching how the
    # characters were split out.
    # NOTE(review): a body line made up solely of rhyme characters is dropped
    # by this test as well - confirm that is acceptable for the data.
    if all(c in rhyme_char_set or c.isspace() for c in stripped):
        continue
    clean_lines.append(stripped)
|
||||
|
||||
# Concatenate the kept lines without separators and show a short preview.
clean_text = "".join(clean_lines)
print("Start of clean text:", clean_text[:100])

# Each token is a 2-tuple: the run of text containing no 〈 or 〉 (possibly
# empty), followed by one or more adjacent 〈...〉 groups.
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
# findall() returns an empty list when nothing matches; avoid indexing it.
print("First token:", tokens[0] if tokens else None)
|
||||
|
||||
Reference in New Issue
Block a user