"""Inspect one HTML page and print the metadata and first token found in it.

The script reads the text of the page's ``<div class="poem">``, prints the
tone/rhyme header, the volume label and the rhyme characters it can find,
drops the repeated header lines, and finally prints the first
``(text, 〈...〉 group)`` token of the remaining text.
"""

import re
from typing import List, Optional, Tuple

DEFAULT_HTML_PATH = "html_files/卷001之1.html"

# Tone name, whitespace, then the rest of the line up to and past "韻".
TONE_RHYME_RE = re.compile(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)")
# "卷X之Y" where X and Y are each exactly one character.
VOLUME_RE = re.compile(r"卷(.)之(.)")
# A run of text followed by one or more consecutive 〈...〉 groups.
TOKEN_RE = re.compile(r"([^〈〉]*)((?:〈[^〉]+〉)+)")

CHINESE_DIGITS = "一二三四五六七八九"
TONE_MARKERS = ("平聲", "上聲", "去聲", "入聲")


def load_poem_text(path: str) -> str:
    """Return the text content of the ``<div class="poem">`` in *path*.

    Raises:
        ValueError: if the page has no ``<div class="poem">`` element.
    """
    # Imported here rather than at module top so the pure-text helpers in
    # this module stay importable (and testable) without bs4 installed.
    from bs4 import BeautifulSoup

    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    poem = soup.find("div", class_="poem")
    if poem is None:
        # find() returns None on no match; fail with a clear message instead
        # of an AttributeError on .get_text().
        raise ValueError(f'no <div class="poem"> found in {path}')
    return poem.get_text()


def extract_tone_rhyme(text: str) -> Optional[Tuple[str, str]]:
    """Return ``(tone, rhyme)`` from the first tone/rhyme header, or None."""
    match = TONE_RHYME_RE.search(text)
    if match is None:
        return None
    return match.group(1), match.group(2)


def format_volume(text: str) -> Optional[str]:
    """Return the volume label ``"X之Y"`` found in *text*, or None.

    A leading ``"0"`` is prepended when X is one of 一 through 九.
    """
    match = VOLUME_RE.search(text)
    if match is None:
        return None
    major, minor = match.groups()
    prefix = "0" if major in CHINESE_DIGITS else ""
    return f"{prefix}{major}之{minor}"


def extract_rhyme_chars(lines: List[str], rhyme: Optional[str]) -> List[str]:
    """Return the single-character fields of the line after the rhyme header.

    Only the first line containing *rhyme* is considered. An empty list is
    returned when *rhyme* is missing, is never found, or is found on the last
    line (so there is no following line to read).
    """
    if not rhyme:
        return []
    for i, line in enumerate(lines):
        if rhyme in line:
            if i + 1 >= len(lines):
                return []
            # split() with no argument already splits on any Unicode
            # whitespace, so no separator normalization is needed first.
            return [field for field in lines[i + 1].split() if len(field) == 1]
    return []


def strip_headers(lines: List[str], rhyme_chars: List[str]) -> List[str]:
    """Return the stripped, non-empty lines that are not known header lines.

    Dropped lines are: the two fixed title lines, any line naming a tone
    together with "韻", and any line made up solely of rhyme characters and
    whitespace.
    """
    rhyme_set = set(rhyme_chars)  # built once; O(1) membership per character
    kept = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped == "欽定四庫全書" or stripped.startswith("御定佩文韻府"):
            continue
        if "韻" in stripped and any(tone in stripped for tone in TONE_MARKERS):
            continue
        # The rhyme-character line itself: nothing but rhyme chars and spaces.
        if all(c in rhyme_set or c.isspace() for c in stripped):
            continue
        kept.append(stripped)
    return kept


def tokenize(clean_text: str) -> List[Tuple[str, str]]:
    """Split *clean_text* into ``(text, 〈...〉 groups)`` pairs."""
    return TOKEN_RE.findall(clean_text)


def main(path: str = DEFAULT_HTML_PATH) -> None:
    """Print tone, rhyme, volume, rhyme characters and first token of *path*."""
    text = load_poem_text(path)

    tone_rhyme = extract_tone_rhyme(text)
    if tone_rhyme is not None:
        print("Tone:", tone_rhyme[0])
        print("Rhyme:", tone_rhyme[1])

    volume = format_volume(text)
    if volume is not None:
        print("Volume:", volume)

    lines = text.split("\n")
    # Without a tone/rhyme header there is nothing to anchor the search on.
    rhyme = tone_rhyme[1] if tone_rhyme is not None else None
    rhyme_chars = extract_rhyme_chars(lines, rhyme)
    print("Chars:", rhyme_chars)

    clean_text = "".join(strip_headers(lines, rhyme_chars))
    print("Start of clean text:", clean_text[:100])

    tokens = tokenize(clean_text)
    print("First token:", tokens[0] if tokens else None)


if __name__ == "__main__":
    main()