36 lines
1.1 KiB
Python
36 lines
1.1 KiB
Python
import bs4
|
|
import os
|
|
import json
|
|
import re
|
|
|
|
# Directory holding the XHTML chapter files to parse
# (presumably the OPS folder of an unpacked EPUB -- confirm against the extraction step).
html_dir = "epub_extracted/OPS"
|
|
|
|
|
|
def parse_html(filepath):
    """Return the text lines found in the poem sections of an (X)HTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's "html.parser".
    Every <p> found inside a <div class="poem"> is split into lines at its
    direct-child <br> elements; the text of all other children (text nodes,
    <span>, <small>, ...) is concatenated and whitespace-stripped.

    Args:
        filepath: Path of the HTML/XHTML file to read.

    Returns:
        A flat list of line strings in document order. A <br> always closes
        a line, so a <br> with nothing before it yields an empty string.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        document = bs4.BeautifulSoup(handle.read(), "html.parser")

    lines = []
    for poem in document.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            # Text fragments gathered since the previous <br/> (or paragraph start).
            pending = []
            for node in paragraph.children:
                if node.name != "br":
                    pending.append(node.get_text())
                    continue
                # A <br/> terminates the current line, even an empty one.
                lines.append("".join(pending).strip())
                pending = []
            # Flush what follows the last <br/>, but only if some child node
            # was actually seen there.
            if pending:
                lines.append("".join(pending).strip())
    return lines
|
|
|
|
|
|
# Parse one chapter and print its first 50 extracted lines, each prefixed
# with its index, for a quick visual check.
sample_path = os.path.join(
    html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
)
texts = parse_html(sample_path)
for index, line in enumerate(texts[:50]):
    print(f"{index}: {line}")
|