Files
spider-ctext/初学记/test_extract.py
2026-03-22 16:18:35 +08:00

25 lines
873 B
Python

import bs4
import re
import sys
def parse_html(filepath, section_suffix=""):
    """Print and collect section headings found in an XHTML/HTML file.

    The file is parsed with BeautifulSoup's ``html.parser``. Inside every
    ``<div class="poem">``, each ``<p>`` is scanned for *direct* child
    ``<span>`` elements that carry a non-empty ``id`` attribute; the
    stripped text of such a span is treated as a section heading.

    Args:
        filepath: Path of the file to read (decoded as UTF-8).
        section_suffix: Suffix (or tuple of suffixes, as accepted by
            ``str.endswith``) a heading must end with to be reported.
            The default ``""`` matches every string, i.e. no filtering.

    Returns:
        list[str]: The headings in document order. Each one is also
        printed as ``Found Section: <heading>``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    sections = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            for child in p.children:
                # Only <span id="..."> children mark a section; <br> tags
                # and bare text nodes are deliberately ignored.
                if child.name != "span" or not child.get("id"):
                    continue
                title = child.text.strip()
                # NOTE(review): the original test was `endswith("")`, which
                # is always True -- the intended suffix character was
                # probably lost. Pass `section_suffix` to restore filtering.
                if not title.endswith(section_suffix):
                    continue
                sections.append(title)
                print("Found Section:", title)
    return sections
# Script entry point: parse the hard-coded XHTML file below. The path is
# relative, so it is resolved against the current working directory.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this call
# also runs whenever the module is imported -- confirm that is intended.
parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")