Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
54
初学记/test_extract2.py
Normal file
54
初学记/test_extract2.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import bs4
|
||||
import re
|
||||
import os
|
||||
|
||||
# Directory that the XHTML input files are read from; it is joined with a
# file name before being handed to parse_html().
# NOTE(review): the path looks like the OPS folder of an unpacked EPUB and is
# relative to the current working directory — confirm where this is run from.
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Parse one XHTML file into an ordered list of ``(kind, text)`` events.

    Only ``<p>`` elements found inside ``<div class="poem">`` are examined.
    The direct children of each paragraph are walked in document order:

    * ``<br>`` ends the line collected so far.
    * ``<span>`` with an ``id`` whose stripped text ends in "部" is emitted
      immediately as a ``("section", text)`` event and is not added to the
      line being collected.
    * ``<small>`` text is added to the line; it is wrapped in ``(...)``
      unless it already starts with "〈" or "(".
    * Anything else (plain text or other tags) contributes its text as-is.

    Whatever remains at the end of a paragraph is flushed as a final line.

    Args:
        filepath: Path to a UTF-8 encoded (X)HTML file.

    Returns:
        A list of tuples, each either ``("line", text)`` or
        ``("section", text)``, with ``text`` stripped of surrounding
        whitespace. Lines that are empty after stripping are not emitted.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    events = []

    def flush_line(parts):
        # Decide emptiness on the joined, stripped text rather than on the
        # list of fragments: whitespace-only text nodes (e.g. the newline
        # that follows a <br/>) would otherwise yield ("line", "") events.
        text = "".join(parts).strip()
        if text:
            events.append(("line", text))

    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                # Text nodes report a name of None, so they fall through to
                # the final branch below.
                if child.name == "br":
                    flush_line(current_line)
                    current_line = []
                elif (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith("部")
                ):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    # Some notes already carry their own opening bracket
                    # (possibly inside a hidden span); do not wrap those twice.
                    if small_text.startswith(("〈", "(")):
                        current_line.append(small_text)
                    else:
                        current_line.append(f"({small_text})")
                else:
                    current_line.append(child.get_text())
            # A paragraph need not end in <br>; emit whatever is left over.
            flush_line(current_line)
    return events
|
||||
|
||||
|
||||
# Manual smoke check: parse the first volume and print the first 30 events
# so the extraction can be inspected by eye.
_sample_xhtml = os.path.join(
    html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
)
events = parse_html(_sample_xhtml)
for event in events[:30]:
    print(event)
|
||||
Reference in New Issue
Block a user