"""Preview the word/description tokens found in the "poem" div of an HTML file.

The script reads one HTML file, extracts the text of its
``<div class="poem">`` element, drops everything before the first
occurrence of the header-end marker, and prints the first few
``Word〈Description〉`` tokens for inspection.
"""
import re
from typing import List, Tuple

HTML_PATH = "html_files/卷001之1.html"
# The first real entry starts here; everything before it is header junk.
HEADER_END_MARKER = "東〈"
# How many tokens to print, and how many description characters per token.
PREVIEW_TOKENS = 20
PREVIEW_CHARS = 30

# A run of characters that are neither bracket (the word, possibly empty),
# followed by one or more 〈...〉 blocks.  Several blocks may follow a single
# word, e.g. 對語〈...〉〈...〉, so the bracketed group repeats.
TOKEN_PATTERN = re.compile(r"([^〈〉]*)((?:〈[^〉]+〉)+)")


def load_poem_text(path: str = HTML_PATH) -> str:
    """Return the text of the ``<div class="poem">`` element with newlines removed.

    Args:
        path: Location of the UTF-8 encoded HTML file to read.

    Returns:
        The div's text content with every ``"\\n"`` stripped out.

    Raises:
        ValueError: If the document contains no ``<div class="poem">``.
        OSError: If the file cannot be opened.
    """
    # Imported here so the pure-text helpers in this module stay importable
    # (and testable) on machines where BeautifulSoup is not installed.
    from bs4 import BeautifulSoup

    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    poem = soup.find("div", class_="poem")
    if poem is None:
        # find() returns None on a miss; fail with a message that names the
        # file instead of an AttributeError on None.
        raise ValueError(f'no <div class="poem"> found in {path!r}')
    return poem.get_text().replace("\n", "")


def strip_header(text: str, marker: str = HEADER_END_MARKER) -> str:
    """Return *text* starting at the first occurrence of *marker*.

    Args:
        text: The raw text to trim.
        marker: The substring that begins the real content.

    Returns:
        The suffix of *text* beginning at *marker*.  If *marker* does not
        occur, *text* is returned unchanged.
    """
    start_idx = text.find(marker)
    if start_idx == -1:
        # str.find signals a miss with -1; slicing with it would keep only
        # the last character of the text.
        return text
    return text[start_idx:]


def tokenize(text: str) -> List[Tuple[str, str]]:
    """Split *text* into ``(word, desc_blocks)`` pairs.

    Args:
        text: Text made of words each followed by one or more ``〈...〉``
            blocks.

    Returns:
        One tuple per match of ``TOKEN_PATTERN``: the characters preceding
        the brackets, and the concatenated ``〈...〉`` blocks that follow.
    """
    return TOKEN_PATTERN.findall(text)


def main() -> None:
    """Load the HTML file, strip its header, and print a preview of the tokens."""
    text = strip_header(load_poem_text())
    for i, (word, desc_blocks) in enumerate(tokenize(text)[:PREVIEW_TOKENS]):
        print(f"Token {i}: WORD='{word}' DESCS={desc_blocks[:PREVIEW_CHARS]}...")


if __name__ == "__main__":
    main()