Update: 初学记、佩文韵府 and 五车韵瑞

This commit is contained in:
denglifan
2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions

View File

@@ -0,0 +1,54 @@
import bs4
import re
import os
# Directory containing the XHTML chapter files; it is joined with a chapter
# file name at the bottom of this script to build the path given to
# parse_html().
# NOTE(review): presumably the OPS folder of an unpacked EPUB, relative to the
# current working directory -- confirm where the script is meant to be run.
html_dir = "epub_extracted/OPS"
def _append_line(events, parts):
    """Append a ``("line", text)`` event unless the joined text is blank."""
    text = "".join(parts).strip()
    # Whitespace-only fragments (for example the newline between ``<br/>`` and
    # ``</p>``) must not surface as empty line events.
    if text:
        events.append(("line", text))


def parse_html(
    filepath,
    *,
    section_suffix="",
    note_brackets=("", ""),
    bracket_openers=("",),
):
    """Extract ordered section and line events from one (X)HTML page.

    Every ``<p>`` inside a ``<div class="poem">`` is walked child by child:

    * ``<br>`` ends the line collected so far;
    * a ``<span>`` with a truthy ``id`` whose stripped text ends with
      *section_suffix* is reported as a section heading (its text is not
      added to the current line, and the current line is not ended);
    * the text of a ``<small>`` element is added to the current line,
      wrapped in *note_brackets* unless it already starts with one of
      *bracket_openers*;
    * the text of any other child is added to the current line as is.

    Whatever is still pending when a ``<p>`` ends is emitted as a final line.
    Lines that are empty after stripping whitespace are skipped.

    NOTE(review): the empty-string defaults reproduce the literals found in
    the original code, where ``endswith("")`` / ``startswith("")`` were always
    true (every ``<span id=...>`` became a section and ``<small>`` text was
    never wrapped). The intended marker characters look like they were lost;
    confirm them and pass them explicitly.

    Args:
        filepath: Path of the UTF-8 encoded file to parse.
        section_suffix: Suffix a span's stripped text must end with to count
            as a section heading. The default ``""`` matches every text.
        note_brackets: ``(opening, closing)`` strings put around ``<small>``
            text that is not already bracketed.
        bracket_openers: Prefixes that mark ``<small>`` text as already
            bracketed. The default ``("",)`` matches every text, so nothing
            is wrapped; an empty tuple wraps every ``<small>`` text.

    Returns:
        A list of ``(kind, text)`` tuples in document order, where ``kind``
        is ``"section"`` or ``"line"``.

    Raises:
        OSError: If *filepath* cannot be opened.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    note_open, note_close = note_brackets
    # str.startswith accepts a tuple of prefixes, replacing the chained checks.
    openers = tuple(bracket_openers)

    events = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                # The tag name is checked first so that .get() is only ever
                # called on <span> elements, never on plain text nodes.
                if child.name == "br":
                    _append_line(events, current_line)
                    current_line = []
                elif (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith(section_suffix)
                ):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    # The original author's comment notes that some notes
                    # already carry (hidden) bracket characters in their
                    # markup; those are kept untouched.
                    if small_text.startswith(openers):
                        current_line.append(small_text)
                    else:
                        current_line.append(
                            f"{note_open}{small_text}{note_close}"
                        )
                else:
                    current_line.append(child.get_text())
            # A paragraph need not end with <br>; flush what is left.
            _append_line(events, current_line)
    return events
# Manual smoke test: parse the first chapter file and print its first 30
# events so the extraction can be inspected by eye.
chapter_path = os.path.join(
    html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
)
events = parse_html(chapter_path)
for e in events[:30]:
    print(e)