"""Split one extracted EPUB XHTML chapter into entries and their sections.

The chapter is read with BeautifulSoup, flattened into ``("line", text)`` and
``("section", text)`` events, grouped into entries at every line containing a
``〈叙事〉`` / ``〈敘事〉`` marker, and each entry is then cut into the three
parts stored under the keys ``"叙事"``, ``"事对"`` and ``"诗文"``.
"""

import os
import re

import bs4

html_dir = "epub_extracted/OPS"

# Chapter file (relative to ``html_dir``) processed when run as a script.
CHAPTER_FILENAME = "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"

# Substrings that mark the line on which a new entry starts.
_NARRATIVE_MARKERS = ("〈叙事〉", "〈敘事〉")

# Everything after the narrative marker.  DOTALL is required because the text
# nodes taken from the XHTML may contain embedded newlines; without it ``.*``
# would stop at the first newline and the rest of the entry would be lost.
_NARRATIVE_RE = re.compile(r"〈(叙事|敘事)〉(.*)", re.DOTALL)

# Entry title: the text before an optional "第N[上下]" ordinal and the
# narrative marker.  DOTALL for the same reason as above, so an entry is not
# dropped just because its leading text contains a newline.
_ENTRY_HEADER_RE = re.compile(
    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉",
    re.DOTALL,
)

# A closing bracket immediately followed by one of the listed genre words.
_GENRE_RE = re.compile(
    r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"
)

# Angle brackets are rewritten to ASCII parentheses in the final output.
_BRACKET_TABLE = str.maketrans({"〈": "(", "〉": ")"})


def parse_html(filepath):
    """Flatten the ``div.poem`` paragraphs of an XHTML file into events.

    Args:
        filepath: Path of the UTF-8 encoded XHTML file to read.

    Returns:
        A list of ``(kind, text)`` tuples in document order.  ``kind`` is
        ``"line"`` for the stripped text accumulated up to a ``<br>`` or the
        end of a ``<p>``, and ``"section"`` for a ``<span>`` that has an
        ``id`` and whose stripped text ends with ``"部"``.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    events = []

    def flush(parts):
        # Empty buffers (e.g. consecutive <br> tags) produce no event.
        if parts:
            events.append(("line", "".join(parts).strip()))

    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    flush(current_line)
                    current_line = []
                elif (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith("部")
                ):
                    # Section headings are emitted on their own and are not
                    # added to the text of the surrounding line.
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    # Wrap <small> text in 〈…〉 unless it already opens with
                    # a bracket of its own.
                    if small_text.startswith(("〈", "(")):
                        current_line.append(small_text)
                    else:
                        current_line.append(f"〈{small_text}〉")
                else:
                    current_line.append(child.get_text())
            flush(current_line)
    return events


def _split_at_genre(text):
    """Split *text* just after the first ``〉`` that precedes a genre word.

    Returns ``(head, tail)``; ``tail`` is ``""`` when no genre word is found.
    """
    match = _GENRE_RE.search(text)
    if match is None:
        return text, ""
    # +1 keeps the closing bracket with the head, so the tail starts at the
    # genre word itself.
    split_idx = match.start() + 1
    return text[:split_idx], text[split_idx:]


def extract_sections(entry_text):
    """Cut one entry's text into its ``叙事`` / ``事对`` / ``诗文`` parts.

    The text after the first ``〈叙事〉`` (or ``〈敘事〉``) marker is split at
    the first ``事對`` (falling back to ``事对``) and at the first genre
    heading matched by the module's genre pattern.  If there is no ``事對``
    marker, the text is split into ``叙事`` and ``诗文`` only.

    Args:
        entry_text: Full text of a single entry.

    Returns:
        A dict with exactly the keys ``"叙事"``, ``"事对"`` and ``"诗文"``.
        Parts that are not found are empty strings, and every ``〈`` / ``〉``
        in the values is replaced by ``(`` / ``)``.  All three values are
        empty when *entry_text* has no narrative marker.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    narrative_match = _NARRATIVE_RE.search(entry_text)
    if not narrative_match:
        return result
    body = narrative_match.group(2)

    # The traditional form is looked up first; the simplified form is only a
    # fallback when the traditional one is absent.
    pairing_start = body.find("事對")
    if pairing_start == -1:
        pairing_start = body.find("事对")

    if pairing_start != -1:
        result["叙事"] = body[:pairing_start]
        # Skip the two-character marker itself.
        after_pairing = body[pairing_start + 2:]
        result["事对"], result["诗文"] = _split_at_genre(after_pairing)
    else:
        result["叙事"], result["诗文"] = _split_at_genre(body)

    return {key: value.translate(_BRACKET_TABLE) for key, value in result.items()}


def _finish_entry(chunks):
    """Turn the buffered lines of one entry into ``(name, sections)``.

    Returns ``None`` when the joined text does not contain an entry header
    (for example, any lines that precede the first narrative marker).
    """
    full_text = "".join(chunks)
    match = _ENTRY_HEADER_RE.search(full_text)
    if match is None:
        return None
    return match.group(1).strip(), extract_sections(full_text)


def collect_entries(events):
    """Group ``parse_html`` events into ``(entry_name, sections)`` tuples.

    A new entry starts at every ``"line"`` event whose text contains a
    narrative marker; following lines are appended to it until the next
    marker.  ``"section"`` events are ignored.

    Args:
        events: Iterable of ``(kind, text)`` tuples as returned by
            ``parse_html``.

    Returns:
        A list of ``(entry_name, sections)`` tuples in document order, where
        ``sections`` is the dict produced by ``extract_sections``.
    """
    entries = []
    pending = []

    def flush():
        entry = _finish_entry(pending)
        if entry is not None:
            entries.append(entry)

    for ev_type, text in events:
        if ev_type == "section":
            continue
        if any(marker in text for marker in _NARRATIVE_MARKERS):
            flush()
            pending = [text]
        else:
            pending.append(text)
    flush()
    return entries


def main():
    """Parse the configured chapter and print a preview of the first entries."""
    events = parse_html(os.path.join(html_dir, CHAPTER_FILENAME))
    entries = collect_entries(events)

    for entry_name, sections in entries[:3]:
        print("Entry:", entry_name)
        print("叙事:", sections["叙事"][:50])
        print("事对:", sections["事对"][:50])
        print("诗文:", sections["诗文"][:50])
        print("-" * 20)


if __name__ == "__main__":
    main()