128 lines
4.5 KiB
Python
128 lines
4.5 KiB
Python
import bs4
|
||
import os
|
||
import re
|
||
|
||
# Directory containing the XHTML chapter files; it is joined with a chapter
# file name before the result is handed to parse_html.
# NOTE(review): presumably the OPS folder of an unpacked EPUB — confirm.
html_dir = "epub_extracted/OPS"
|
||
|
||
|
||
def parse_html(filepath):
    """Read an HTML/XHTML file and flatten its poem paragraphs into events.

    Only ``<p>`` elements found inside ``<div class="poem">`` are visited.
    The direct children of each paragraph are walked in document order:

    * ``<br>`` ends the line being built (empty lines are not emitted);
    * a ``<span>`` that has an ``id`` and whose stripped text ends with "部"
      produces a ``("section", text)`` event and contributes nothing to the
      current line;
    * ``<small>`` text is wrapped in 〈…〉 unless it already starts with
      "〈" or "(";
    * anything else contributes its text unchanged.

    Whatever is left at the end of a paragraph is emitted as a final line.

    Args:
        filepath: Path of the UTF-8 encoded file to parse.

    Returns:
        A list of ``(kind, text)`` tuples where ``kind`` is ``"line"`` or
        ``"section"``. Line text is stripped of surrounding whitespace.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        markup = handle.read()
    document = bs4.BeautifulSoup(markup, "html.parser")

    events = []

    def emit_line(fragments):
        events.append(("line", "".join(fragments).strip()))

    for poem in document.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            fragments = []
            for node in paragraph.children:
                tag = node.name

                if tag == "br":
                    if fragments:
                        emit_line(fragments)
                        fragments = []
                    continue

                # `tag` is checked first so that plain text nodes (whose name
                # is None) never have `.get` called on them.
                if tag == "span" and node.get("id"):
                    heading = node.text.strip()
                    if heading.endswith("部"):
                        events.append(("section", heading))
                        continue

                if tag == "small":
                    annotation = node.get_text()
                    if annotation.startswith(("〈", "(")):
                        fragments.append(annotation)
                    else:
                        fragments.append(f"〈{annotation}〉")
                    continue

                fragments.append(node.get_text())

            # A paragraph need not end with <br>; flush the trailing line.
            if fragments:
                emit_line(fragments)

    return events
|
||
|
||
|
||
# Opening marker of the narrative part, in simplified or traditional form.
_NARRATIVE_MARKER = re.compile(r"〈(?:叙事|敘事)〉")

# A closing 〉 immediately followed by a genre character; the text after that
# bracket is treated as the start of the "诗文" part.
_GENRE_AFTER_BRACKET = re.compile(
    r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"
)

# Angle brackets are normalised to ASCII parentheses in the returned text.
_BRACKET_TABLE = str.maketrans("〈〉", "()")


def _split_at_genre(text):
    """Split *text* right after the 〉 that precedes the first genre character.

    Returns ``(text, "")`` when no such position exists.
    """
    hit = _GENRE_AFTER_BRACKET.search(text)
    if hit is None:
        return text, ""
    cut = hit.start() + 1  # keep the closing 〉 with the left-hand part
    return text[:cut], text[cut:]


def extract_sections(entry_text):
    """Split one entry into its "叙事", "事对" and "诗文" parts.

    Everything before the first 〈叙事〉/〈敘事〉 marker is ignored. The text
    after it is cut at the first "事對" (or, failing that, "事对"); what
    follows is cut again at the first 〉 directly followed by a genre
    character. When no "事對"/"事对" is present, the genre cut is applied to
    the narrative text instead and "事对" stays empty.

    The whole remainder of the entry is used, including any text that follows
    a newline (a ``.*`` capture without DOTALL would stop at the first line
    break and silently drop the rest).

    Args:
        entry_text: Full text of one entry, including its heading.

    Returns:
        A dict with the keys "叙事", "事对" and "诗文". Missing parts are
        empty strings; all values have 〈 and 〉 replaced by ( and ). If the
        narrative marker is absent, all three values are empty.
    """
    sections = {"叙事": "", "事对": "", "诗文": ""}

    marker = _NARRATIVE_MARKER.search(entry_text)
    if marker is None:
        return sections

    body = entry_text[marker.end():]

    # Prefer the traditional spelling, falling back to the simplified one.
    # NOTE(review): the search is for the bare two characters; if the marker
    # arrives as 〈事對〉 its brackets stay attached to the neighbouring
    # sections — confirm whether that is intended.
    pair_at = body.find("事對")
    if pair_at == -1:
        pair_at = body.find("事对")

    if pair_at != -1:
        sections["叙事"] = body[:pair_at]
        # Both spellings are two characters long, hence the fixed offset.
        sections["事对"], sections["诗文"] = _split_at_genre(body[pair_at + 2:])
    else:
        sections["叙事"], sections["诗文"] = _split_at_genre(body)

    return {key: value.translate(_BRACKET_TABLE) for key, value in sections.items()}
|
||
|
||
|
||
# Heading that opens an entry: a lazily matched name, an optional ordinal such
# as "第一" or "第二上", then the 〈叙事〉/〈敘事〉 marker.
_ENTRY_HEADING = re.compile(
    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉"
)


def _flush_entry(buffered_lines, collected):
    """Append ``(entry_name, sections)`` to *collected* if the buffer is an entry.

    Nothing is appended when the buffer is empty or its joined text does not
    match the entry heading (e.g. text seen before the first 〈叙事〉 line).
    """
    if not buffered_lines:
        return
    full_text = "".join(buffered_lines)
    heading = _ENTRY_HEADING.search(full_text)
    if heading:
        collected.append((heading.group(1).strip(), extract_sections(full_text)))


events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)

entries = []
current_entry_text = []

for ev_type, text in events:
    if ev_type == "section":
        # Section events are not added to any entry's text.
        continue

    if "〈叙事〉" in text or "〈敘事〉" in text:
        # A line carrying the narrative marker starts a new entry: close the
        # previous one and begin buffering with this line.
        _flush_entry(current_entry_text, entries)
        current_entry_text = [text]
    else:
        current_entry_text.append(text)

# The last entry has no following marker line to trigger its flush.
_flush_entry(current_entry_text, entries)

# Preview the first three entries, 50 characters per section.
for entry_name, sections in entries[:3]:
    print("Entry:", entry_name)
    print("叙事:", sections["叙事"][:50])
    print("事对:", sections["事对"][:50])
    print("诗文:", sections["诗文"][:50])
    print("-" * 20)
|