Files
spider-ctext/初学记/test_split.py
2026-03-22 16:18:35 +08:00

42 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
# A "〉" immediately followed by one of the listed genre labels. The pattern
# text is kept exactly as authored; "銘" and "論" each appear twice in the
# alternation, which is redundant but harmless.
_GENRE_BOUNDARY = re.compile(
    r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对|銘|論)"
)

# Section markers, in lookup order: the traditional form is tried first and
# the simplified form is only consulted when the traditional one is absent.
_PAIRS_MARKERS = ("事對", "事对")


def _split_before_genre(text):
    """Split *text* right after the first "〉" that precedes a genre label.

    Returns a ``(head, tail)`` tuple. ``head`` ends with the "〉" and ``tail``
    starts with the genre label. When no such boundary exists the result is
    ``(text, "")``.
    """
    boundary = _GENRE_BOUNDARY.search(text)
    if boundary is None:
        return text, ""
    # +1 keeps the "〉" itself on the left-hand side of the cut.
    cut = boundary.start() + 1
    return text[:cut], text[cut:]


def extract_sections(content_text: str) -> dict:
    """Split *content_text* into the "叙事", "事对" and "诗文" sections.

    The text is first cut at the first occurrence of the marker "事對"
    (or "事对" when "事對" does not occur at all):

    * Marker found: everything before it becomes "叙事". The marker itself is
      dropped, and the remainder is split at the first "〉" + genre label into
      "事对" (up to and including the "〉") and "诗文" (from the genre label on).
    * Marker not found: the whole text is split the same way into "叙事" and
      "诗文"; "事对" stays empty.

    If no "〉" + genre label boundary exists, "诗文" stays empty and the
    unsplit text goes to the section that would have received the head.

    Args:
        content_text: The raw text to split.

    Returns:
        A dict with exactly the keys "叙事", "事对" and "诗文" (in that order),
        each mapped to a string that may be empty.
    """
    marker = ""
    marker_pos = -1
    for marker in _PAIRS_MARKERS:
        marker_pos = content_text.find(marker)
        if marker_pos != -1:
            break

    sections = {"叙事": "", "事对": "", "诗文": ""}
    if marker_pos != -1:
        sections["叙事"] = content_text[:marker_pos]
        after_marker = content_text[marker_pos + len(marker):]
        sections["事对"], sections["诗文"] = _split_before_genre(after_marker)
    else:
        sections["叙事"], sections["诗文"] = _split_before_genre(content_text)

    # NOTE(review): both search strings below appear empty, which makes these
    # replace() calls no-ops. They may originally have held invisible or
    # private-use characters that were lost in transcription -- confirm
    # against the original file before relying on or removing this step.
    return {
        key: text.replace("", "").replace("", "")
        for key, text in sections.items()
    }
import json
# Manual smoke check: print the sections extracted from a short sample that
# contains the "事對" marker followed by a "〉詩" boundary.
print(
    extract_sections(
        content_text="這里是叙事事對這裡是事對〉詩這里是詩文"
    )
)