Files
spider-ctext/初学记/test_extract2.py
2026-03-22 16:18:35 +08:00

55 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import bs4
import re
import os
# Directory containing the XHTML file parsed at the bottom of this script.
# The path is relative, so it is resolved against the current working directory.
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Extract an ordered list of text events from one XHTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's "html.parser".
    Every ``<p>`` found inside a ``<div class="poem">`` is walked child by
    child, and the text between ``<br>`` elements is collected into lines.

    Args:
        filepath: Path of the XHTML file to read.

    Returns:
        A list of ``(kind, text)`` tuples in document order, where ``kind``
        is ``"line"`` (stripped text accumulated up to a ``<br>`` or the end
        of the paragraph) or ``"section"`` (stripped text of a ``<span>``
        that carries an ``id`` attribute).
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        markup = handle.read()
    soup = bs4.BeautifulSoup(markup, "html.parser")

    events = []
    for poem in soup.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            pieces = []
            for node in paragraph.children:
                tag = node.name

                # A <br> closes the line collected so far (if there is one).
                if tag == "br":
                    if pieces:
                        events.append(("line", "".join(pieces).strip()))
                        pieces = []
                    continue

                # A <span> with an id is reported as a section heading.
                # NOTE(review): the suffix literal is empty in this copy of
                # the file, so endswith() is always true; presumably a marker
                # character was lost -- confirm against the original source.
                if (
                    tag == "span"
                    and node.get("id")
                    and node.text.strip().endswith("")
                ):
                    events.append(("section", node.text.strip()))
                    continue

                # Anything that is not <small> contributes its plain text.
                if tag != "small":
                    pieces.append(node.get_text())
                    continue

                # <small> text is meant to be wrapped in brackets unless it
                # already starts with one. Per the original author's note the
                # markup sometimes already carries hidden pseudo-brackets such
                # as `<span style="color:transparent;font-size:0px">〈</span>`.
                # NOTE(review): both prefix literals and the f-string wrapper
                # are empty in this copy, so the text is appended unchanged on
                # either path -- presumably bracket characters were lost.
                small_text = node.get_text()
                already_bracketed = small_text.startswith(
                    ""
                ) or small_text.startswith("")
                if already_bracketed:
                    pieces.append(small_text)
                else:
                    pieces.append(f"{small_text}")

            # Emit whatever is left once the paragraph's children are exhausted.
            if pieces:
                events.append(("line", "".join(pieces).strip()))
    return events
# Smoke run: parse one sample XHTML file and print its first 30 events.
sample_path = os.path.join(
    html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
)
events = parse_html(sample_path)
for event in events[:30]:
    print(event)