# Scratch script: print the tone, rhyme heading and rhyme-character list
# found in the "poem" <div> of one saved HTML page.
#
# Output (stdout) when the header is found:
#   Tone:       one of 上平聲 / 下平聲 / 上聲 / 去聲 / 入聲
#   Rhyme:      the heading line containing "韻"
#   Chars line: the raw line that follows the heading
#   Chars:      the single-character tokens of that line
import re
import sys

from bs4 import BeautifulSoup

HTML_PATH = "html_files/卷001之1.html"

# Group 1: tone name. Group 2: the heading containing "韻", up to the end of
# its line. Group 3: the whole line immediately after the heading, which is
# where the rhyme-character list is expected (it appears after "一東韻一" or
# similar).
HEADER_RE = re.compile(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n")

with open(HTML_PATH, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

poem_div = soup.find("div", class_="poem")
if poem_div is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the .get_text() call below.
    raise SystemExit(f'{HTML_PATH}: no <div class="poem"> element found')
text = poem_div.get_text()

m = HEADER_RE.search(text)
if m:
    print("Tone:", m.group(1))
    print("Rhyme:", m.group(2))
    print("Chars line:", m.group(3))
    # str.split() with no separator already splits on every Unicode
    # whitespace character (ideographic and no-break spaces included), so no
    # space normalisation is needed first. Only single-character tokens are
    # kept; longer tokens on the line are discarded.
    rhyme_chars = [c for c in m.group(3).split() if len(c) == 1]
    print("Chars:", rhyme_chars)
else:
    # Report on stderr so stdout stays exactly as before (empty) in this case.
    print(f"{HTML_PATH}: no tone/rhyme header found in poem text", file=sys.stderr)