Files
spider-ctext/初学记/test_extract3.py
2026-03-22 16:18:35 +08:00

128 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import bs4
import os
import re
# Directory that the input XHTML file is loaded from.
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Turn one XHTML file into a flat list of ``(kind, text)`` events.

    Each ``<p>`` inside a ``<div class="poem">`` is walked child by child.
    Text is buffered until a ``<br>`` (or the end of the paragraph) and then
    emitted as a ``("line", text)`` event with surrounding whitespace
    stripped.  A ``<span>`` child that has an ``id`` attribute is emitted
    as a ``("section", text)`` event instead of being buffered.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list of ``(kind, text)`` tuples in document order, where ``kind``
        is ``"line"`` or ``"section"``.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        markup = handle.read()
    soup = bs4.BeautifulSoup(markup, "html.parser")

    events = []
    for poem in soup.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            pieces = []
            for node in paragraph.children:
                tag = node.name

                if tag == "br":
                    # A line break closes the buffered line; an empty
                    # buffer produces no event.
                    if pieces:
                        events.append(("line", "".join(pieces).strip()))
                        pieces = []
                    continue

                # NOTE(review): the suffix/prefix literals below are empty
                # strings as seen here, so these tests are always true.
                # The intended characters may have been lost -- confirm.
                if (
                    tag == "span"
                    and node.get("id")
                    and node.text.strip().endswith("")
                ):
                    events.append(("section", node.text.strip()))
                    continue

                if tag == "small":
                    small_text = node.get_text()
                    if small_text.startswith(("", "")):
                        pieces.append(small_text)
                    else:
                        pieces.append(f"{small_text}")
                    continue

                # Plain text and every other element contribute their text.
                pieces.append(node.get_text())

            # Emit whatever is still buffered at the end of the paragraph.
            if pieces:
                events.append(("line", "".join(pieces).strip()))
    return events
def extract_sections(entry_text):
    """Split one entry's text into its 叙事, 事对 and 诗文 parts.

    Everything after the first ``〈叙事〉`` / ``〈敘事〉`` marker is examined:

    * If ``事對`` (or, failing that, ``事对``) occurs, the text before it
      becomes 叙事 and the text after it is split further.
    * The remaining text is split at the first ``〉`` that is immediately
      followed by a genre name (賦, 詩, ...).  The ``〉`` stays with the
      preceding part; the genre name starts 诗文.

    Args:
        entry_text: Full text of a single entry; it may contain newlines.

    Returns:
        A dict with the keys ``"叙事"``, ``"事对"`` and ``"诗文"``.  Parts
        that were not found are empty strings; all three are empty when
        ``entry_text`` has no narrative marker.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # DOTALL: without it ``.*`` stops at the first newline and the rest of
    # the entry would be silently discarded.
    narrative_match = re.search(r"〈(叙事|敘事)〉(.*)", entry_text, re.DOTALL)
    if not narrative_match:
        return result
    rest_after_narrative = narrative_match.group(2)

    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"

    # Prefer the traditional spelling, fall back to the simplified one.
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")

    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        # Skip the two-character marker itself.
        remainder = rest_after_narrative[shidui_start + 2 :]
        remainder_key = "事对"
    else:
        remainder = rest_after_narrative
        remainder_key = "叙事"

    shiwen_match = re.search(genre_pattern, remainder)
    if shiwen_match:
        # +1 keeps the closing bracket with the part before the genre name.
        split_idx = shiwen_match.start() + 1
        result[remainder_key] = remainder[:split_idx]
        result["诗文"] = remainder[split_idx:]
    else:
        result[remainder_key] = remainder

    # NOTE(review): both search strings are empty as seen here, which makes
    # this clean-up a no-op.  The characters meant to be removed may have
    # been lost -- confirm against the intended markers.
    for k in result:
        result[k] = result[k].replace("", "").replace("", "")
    return result
# Entry heading: the entry name (group 1), an optional ordinal such as
# "第一" or "第一上", then the narrative marker.  DOTALL lets the name span an
# embedded newline instead of making the whole match fail.
_ENTRY_HEADING = re.compile(
    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉",
    re.DOTALL,
)


def _finish_entry(lines, entries):
    """Parse the buffered *lines* as one entry and append it to *entries*.

    Nothing is appended when the buffer is empty or the joined text
    contains no narrative marker.
    """
    if not lines:
        return
    full_text = "".join(lines)
    match = _ENTRY_HEADING.search(full_text)
    if match:
        entries.append((match.group(1).strip(), extract_sections(full_text)))


events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)

entries = []
current_entry_text = []
for ev_type, text in events:
    if ev_type == "section":
        # Section events are ignored when assembling entries.
        continue
    if "〈叙事〉" in text or "〈敘事〉" in text:
        # A narrative marker opens a new entry; close the previous one first.
        _finish_entry(current_entry_text, entries)
        current_entry_text = [text]
    else:
        current_entry_text.append(text)
# The last entry has no following marker to trigger its flush.
_finish_entry(current_entry_text, entries)

# Preview the first three entries, at most 50 characters per part.
for entry_name, sections in entries[:3]:
    print("Entry:", entry_name)
    print("叙事:", sections["叙事"][:50])
    print("事对:", sections["事对"][:50])
    print("诗文:", sections["诗文"][:50])
    print("-" * 20)