Files
spider-ctext/佩文韵府/test_split.py
2026-03-22 16:18:35 +08:00

19 lines
626 B
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
from bs4 import BeautifulSoup
# Exploratory script: load one saved HTML page, take the text of its
# <div class="poem">, locate the tone/rhyme header inside that text, and print
# the tone, the rhyme name, and the single-character tokens on the next line.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

poem_div = soup.find("div", class_="poem")
if poem_div is None:
    # find() yields None when nothing matches; stop with a readable message
    # rather than an AttributeError on the get_text() call below.
    raise SystemExit('No <div class="poem"> element found in the HTML file')
text = poem_div.get_text()

# Extract the list of characters.
# It appears after "一東韻一" or similar.
# Groups: (1) the tone label, (2) a single-line span containing "韻",
# (3) everything on the line that follows it.
m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n", text)
if m:
    print("Tone:", m.group(1))
    print("Rhyme:", m.group(2))
    print("Chars line:", m.group(3))
    # NOTE(review): the original code replaced an invisible literal character
    # with a plain space here. It is spelled as an explicit escape so the
    # intent is visible in the source; U+3000 (ideographic space) is assumed —
    # confirm against the raw file. split() with no separator splits on any
    # Unicode whitespace, so the tokens are whitespace-delimited either way.
    chars_line = m.group(3).replace("\u3000", " ")
    # Keep only tokens that are exactly one character long.
    rhyme_chars = [c for c in chars_line.split() if len(c) == 1]
    print("Chars:", rhyme_chars)
else:
    # Make a failed match visible instead of exiting with no output at all.
    print("No tone/rhyme header matched in the poem text")