# spider-ctext/初学记/test_extract3.py

import bs4
import os
import re

# Relative directory that holds the XHTML chapter files; the script below
# joins it with a chapter file name before parsing.
# NOTE(review): presumably the OPS folder of an unpacked EPUB — confirm.
html_dir = "epub_extracted/OPS"


def parse_html(filepath):
    """Parse an XHTML file into a flat list of ``(kind, text)`` events.

    Only ``<p>`` elements found inside ``<div class="poem">`` are read.
    Each paragraph is split into lines at ``<br>`` tags.

    Event kinds:
        ``("line", text)``    -- the stripped text collected between breaks.
        ``("section", text)`` -- a ``<span>`` that carries an ``id`` and whose
                                 stripped text ends with "部".

    Text of ``<small>`` elements is wrapped in 〈…〉 unless it already starts
    with 〈 or （; every other child contributes its plain text.

    Args:
        filepath: Path of the UTF-8 encoded XHTML file.

    Returns:
        The list of events in document order.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        document = bs4.BeautifulSoup(handle.read(), "html.parser")

    events = []

    def emit_line(fragments):
        # A line is the concatenation of its fragments, trimmed at both ends.
        events.append(("line", "".join(fragments).strip()))

    for poem in document.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            pending = []
            for node in paragraph.children:
                tag = node.name

                if tag == "br":
                    # A break closes the current line, if anything was collected.
                    if pending:
                        emit_line(pending)
                        pending = []
                    continue

                if tag == "span" and node.get("id"):
                    heading = node.text.strip()
                    if heading.endswith("部"):
                        events.append(("section", heading))
                        continue
                    # Other spans fall through and are treated as plain text.

                if tag == "small":
                    note = node.get_text()
                    if note.startswith(("〈", "（")):
                        pending.append(note)
                    else:
                        pending.append(f"〈{note}〉")
                else:
                    pending.append(node.get_text())

            # Flush whatever follows the last break of the paragraph.
            if pending:
                emit_line(pending)

    return events


# A genre heading is recognised only directly after a closing 〉.
_GENRE_AFTER_NOTE = re.compile(
    r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"
)

# Angle brackets are rewritten to full-width parentheses in the output.
_BRACKET_TABLE = str.maketrans("〈〉", "（）")


def _split_before_genre(text):
    """Split *text* right after the 〉 that precedes the first genre heading.

    Returns ``(head, tail)``; ``tail`` is empty when no genre heading exists.
    """
    found = _GENRE_AFTER_NOTE.search(text)
    if found is None:
        return text, ""
    cut = found.start() + 1  # keep the closing 〉 with the head part
    return text[:cut], text[cut:]


def extract_sections(entry_text):
    """Split one entry's text into its 叙事 / 事对 / 诗文 parts.

    Everything after the first 〈叙事〉 (or 〈敘事〉) marker is examined:

    * text up to the first "事對" (falling back to "事对") is the 叙事 part;
    * the text after that marker, up to the first genre heading that
      directly follows a 〉, is the 事对 part;
    * the remainder, starting at the genre heading, is the 诗文 part.

    When no 事對/事对 marker exists, the text is split into 叙事 and 诗文 at
    the genre heading only. In every returned value 〈 and 〉 are replaced by
    （ and ）.

    Args:
        entry_text: Full text of a single entry, including the 〈叙事〉 marker.

    Returns:
        A dict with the keys "叙事", "事对" and "诗文"; all values are empty
        strings when the 〈叙事〉 marker is missing.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # DOTALL is required: without it `.` stops at the first newline and the
    # rest of a multi-line entry would be dropped silently.
    narrative_match = re.search(r"〈(?:叙事|敘事)〉(.*)", entry_text, re.DOTALL)
    if not narrative_match:
        return result

    body = narrative_match.group(1)

    # The traditional form is preferred; the simplified form is a fallback.
    pairs_start = body.find("事對")
    if pairs_start == -1:
        pairs_start = body.find("事对")

    if pairs_start != -1:
        result["叙事"] = body[:pairs_start]
        # Skip the two-character marker itself.
        after_pairs = body[pairs_start + 2 :]
        result["事对"], result["诗文"] = _split_before_genre(after_pairs)
    else:
        result["叙事"], result["诗文"] = _split_before_genre(body)

    return {key: value.translate(_BRACKET_TABLE) for key, value in result.items()}


# Entry header: the entry name (group 1), an optional ordinal such as
# "第一" or "第三上", then the 〈叙事〉 / 〈敘事〉 marker.
_ENTRY_HEADER = re.compile(
    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉"
)


def _flush_entry(chunks, sink):
    """Turn the collected line texts of one entry into ``(name, sections)``.

    Nothing is appended when *chunks* is empty or when the joined text does
    not start with a recognisable entry header.
    """
    if not chunks:
        return
    full_text = "".join(chunks)
    header = _ENTRY_HEADER.search(full_text)
    if header:
        sink.append((header.group(1).strip(), extract_sections(full_text)))


events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)

entries = []
current_entry_text = []

for ev_type, text in events:
    if ev_type == "section":
        # Section headings do not belong to any entry's text.
        continue
    if "〈叙事〉" in text or "〈敘事〉" in text:
        # A line carrying the marker starts a new entry: close the previous one.
        _flush_entry(current_entry_text, entries)
        current_entry_text = [text]
    else:
        current_entry_text.append(text)

# The last entry has no following marker line to trigger its flush.
_flush_entry(current_entry_text, entries)

# Preview the first three entries, 50 characters per part.
for entry_name, sections in entries[:3]:
    print("Entry:", entry_name)
    print("叙事:", sections["叙事"][:50])
    print("事对:", sections["事对"][:50])
    print("诗文:", sections["诗文"][:50])
    print("-" * 20)