pg
02/10/2026
page_text = (
    df.groupby("page_number")["text"]
    .apply(lambda x: " ".join(x))
)
MAX_TOKENS = 450
OVERLAP = 40
chunks = []
current = ""
prev_tail = ""
for para in paragraphs:
    if token_len(current + para) > MAX_TOKENS:
        chunks.append(prev_tail + current)
        prev_tail = get_last_tokens(current, OVERLAP)
        current = para
    else:
        current += " " + para
if current:
    chunks.append(prev_tail + current)
pip install pandas tiktoken
import pandas as pd
import re
import tiktoken
# =====================
# Configuration
# =====================
CSV_PATH = "input.csv"
MODEL_NAME = "gpt-4o-mini"  # used only to pick the token-counting encoding
MAX_TOKENS = 450            # hard cap on tokens accumulated per chunk
OVERLAP_TOKENS = 40         # tail tokens carried from one chunk into the next

# Tokenizer matching MODEL_NAME (tiktoken raises KeyError for unknown models).
enc = tiktoken.encoding_for_model(MODEL_NAME)
# =====================
# Utility functions
# =====================
def token_len(text: str) -> int:
    """Return how many tokens *text* occupies under the configured ``enc``."""
    token_ids = enc.encode(text)
    return len(token_ids)
def get_last_tokens(text: str, n: int) -> str:
    """Return the text decoded from the last *n* tokens of *text*.

    Used to build the overlap prefix carried into the next chunk.

    Fix: returns "" for n <= 0.  The original `tokens[-n:]` with n == 0
    is `tokens[0:]`, i.e. the WHOLE text, which would silently duplicate
    entire chunks if the overlap were ever configured to 0.
    """
    if n <= 0:
        return ""
    tokens = enc.encode(text)
    # tokens[-n:] already yields the full list when len(tokens) <= n,
    # so the original `len(tokens) > n` branch was redundant.
    return enc.decode(tokens[-n:])
def clean_text(text: str) -> str:
    """Normalize one OCR text element; return "" for page-marker/date debris."""
    stripped = text.strip()
    # OCR artifacts from page furniture / table of contents.
    junk_patterns = (
        r"-\s*[ivxIVX]+\s*-",                # roman-numeral page marker, e.g. "- iv -"
        r"\d{4}\.\s*\d{1,2}\.\s*\d{1,2}\.",  # date stamp, e.g. "2024. 1. 2."
    )
    if any(re.fullmatch(pattern, stripped) for pattern in junk_patterns):
        return ""
    # Single characters are treated as OCR noise as well.
    return stripped if len(stripped) > 1 else ""
def split_paragraphs(text: str):
    """Split OCR page text into paragraphs.

    Breaks on: blank lines, whitespace after a colon, double spaces after
    a period, whitespace after Korean sentence endings ("...다."), and
    whitespace after numbered or Korean-lettered list markers.
    """
    separators = [
        r"\n\s*\n",
        r"(?<=:)\s+",
        r"(?<=\.)\s{2,}",
        r"(?<=다\.)\s+",
        r"(?<=\d\.)\s+",
        r"(?<=[가-하]\.)\s+",
    ]
    pieces = re.split("|".join(separators), text)
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
# =====================
# 1. Load CSV
# =====================
# Expected columns: page_number, element_id, text -- TODO confirm with producer.
df = pd.read_csv(CSV_PATH)
# =====================
# 2. Concatenate elements per page
# =====================
# Sort so elements join in reading order, clean each element, drop the
# ones clean_text() rejected (empty strings are falsy under filter(None, ...)).
page_texts = (
    df.sort_values(["page_number", "element_id"])
    .groupby("page_number")["text"]
    .apply(lambda x: " ".join(filter(None, map(clean_text, x))))
    .to_dict()
)
# =====================
# 3. Paragraph splitting
# =====================
# One record per paragraph, tagged with the page it came from.
paragraphs = [
    {"page": page, "text": para}
    for page, text in page_texts.items()
    for para in split_paragraphs(text)
]
# =====================
# 4. Chunking + Overlap
# =====================
chunks = []            # finished chunks: dicts with text / pages / tokens
current = ""           # text accumulated for the chunk being built
current_pages = set()  # page numbers contributing to `current`
prev_tail = ""         # last OVERLAP_TOKENS tokens of the previous chunk
for p in paragraphs:
    para_text = p["text"]
    para_tokens = token_len(para_text)
    if para_tokens > MAX_TOKENS:
        continue  # drop over-long paragraphs (usually broken OCR)
    if token_len(current + " " + para_text) > MAX_TOKENS:
        # Flush the current chunk, prefixed with the previous chunk's tail.
        # NOTE(review): the prepended tail can push the stored chunk past
        # MAX_TOKENS -- the cap is only enforced on `current` alone.
        chunks.append({
            "text": (prev_tail + " " + current).strip(),
            "pages": sorted(current_pages),
            "tokens": token_len(prev_tail + " " + current)
        })
        prev_tail = get_last_tokens(current, OVERLAP_TOKENS)
        current = para_text
        current_pages = {p["page"]}
    else:
        current += " " + para_text
        current_pages.add(p["page"])
# Flush the trailing partial chunk, if any.
if current:
    chunks.append({
        "text": (prev_tail + " " + current).strip(),
        "pages": sorted(current_pages),
        "tokens": token_len(prev_tail + " " + current)
    })
# =====================
# 5. Save results
# =====================
out_df = pd.DataFrame(chunks)
# ensure_ascii=False keeps Korean text readable in the JSON output.
out_df.to_json("chunks.json", orient="records", ensure_ascii=False, indent=2)
print(f"완료: {len(out_df)} chunks 생성")
You are given ONE text chunk.
DO NOT split, merge, or rewrite it.
Return ONLY a JSON object with:
- type (objective/method/result/background/etc)
- short_topic (5 words max)
- keywords (max 6)
Text:
"""
{{CHUNK_TEXT}}
"""
LLM 프롬프트 (초경량, 핵심)
You are given two consecutive text elements from a document.
Question:
Do these two elements belong to the SAME semantic unit?
Answer ONLY one of:
- YES
- NO
Element A:
"{{text_a}}"
Element B:
"{{text_b}}"
# Draft of the rule + LLM chunk-merging loop (sketch only: rule_says_yes,
# rule_says_no, llm_says_yes and elements are not defined in this snippet;
# the runnable version is semantic_chunk() further below).
chunks = []
current_chunk = [elements[0]]
for i in range(len(elements) - 1):
    a = elements[i]
    b = elements[i + 1]
    if rule_says_yes(a, b):
        current_chunk.append(b)
    elif rule_says_no(a, b):
        chunks.append(current_chunk)
        current_chunk = [b]
    else:
        # Rules are ambiguous -> defer the join/split decision to the LLM.
        if llm_says_yes(a, b):
            current_chunk.append(b)
        else:
            chunks.append(current_chunk)
            current_chunk = [b]
chunks.append(current_chunk)
import pandas as pd
import re
from typing import List
# ---------------------
# Configuration
# ---------------------
CSV_PATH = "input.csv"
# Whether to fall back to the LLM when the rules are ambiguous.
USE_LLM = True
# Start a new chunk whenever the page number changes.
# NOTE(review): rule_connect() breaks on page change unconditionally;
# this flag is never read by the code visible here.
BREAK_ON_PAGE_CHANGE = True
def is_heading(text: str) -> bool:
    """Heuristic: colon-terminated lines, or short lines without a final period."""
    if text.endswith(":"):
        return True
    return len(text) < 20 and not text.endswith(".")
def rule_connect(a: dict, b: dict) -> bool | None:
    """Rule-based linking decision for two consecutive elements.

    Returns:
        True  -- always join
        False -- always split
        None  -- ambiguous, defer to the LLM
    """
    text_a = a["text"]
    text_b = b["text"]
    # 1. Page boundary: always split.
    if a["page"] != b["page"]:
        return False
    # 2. Heading followed by its body: always join.
    if is_heading(text_a):
        return True
    # 3. Sentence cut mid-stream (no terminal period): always join.
    if not text_a.endswith((".", "다.", "니다.")):
        return True
    # 4. Numbered-list continuation, e.g. "1." / "(2)" prefix on B: join.
    if re.match(r"^\(?\d+[\.\)]", text_b):
        return True
    # Rules are inconclusive.
    return None
def call_llm_yes_no(text_a: str, text_b: str) -> bool:
    """Ask the LLM whether two consecutive elements form one semantic unit.

    The prompt constrains the model to answer YES or NO; anything other
    than "YES" (after strip/uppercase) is treated as NO.
    """
    prompt = f"""
You are given two consecutive text elements from a document.
Do they belong to the SAME semantic unit?
Answer ONLY:
YES or NO
Element A:
\"\"\"{text_a}\"\"\"
Element B:
\"\"\"{text_b}\"\"\"
""".strip()
    # -------------------------
    # 🔥 Plug your LLM call in here
    # -------------------------
    # NOTE(review): YOUR_LLM_CALL is an undefined placeholder -- calling
    # this function raises NameError until it is replaced with a real client.
    response = YOUR_LLM_CALL(prompt)
    return response.strip().upper() == "YES"
def semantic_chunk(elements: List[dict]) -> List[List[dict]]:
    """Group consecutive elements into semantic chunks.

    Each element is a dict with at least "page" and "text" keys.
    rule_connect() decides first; ambiguous pairs go to the LLM when
    USE_LLM is set, otherwise they split.

    Fix: returns [] for empty input (the original crashed with
    IndexError on elements[0]).

    Returns a list of chunks, each a list of the original element dicts.
    """
    if not elements:
        return []
    chunks: List[List[dict]] = []
    current = [elements[0]]
    # Walk consecutive (a, b) pairs instead of indexing by position.
    for a, b in zip(elements, elements[1:]):
        decision = rule_connect(a, b)
        if decision is None and USE_LLM:
            decision = call_llm_yes_no(a["text"], b["text"])
        if decision:  # True -> join; False or unresolved None -> split
            current.append(b)
        else:
            chunks.append(current)
            current = [b]
    chunks.append(current)
    return chunks
# Load the CSV and normalize rows into ordered element dicts.
df = pd.read_csv(CSV_PATH)
# Keep only rows whose text is a real non-blank string (drops NaN and
# whitespace-only cells), sorted into reading order by page then element id.
elements = [
    {
        "page": row.page_number,
        "element": row.element_id,
        "text": str(row.text).strip()
    }
    for _, row in df.sort_values(
        ["page_number", "element_id"]
    ).iterrows()
    if isinstance(row.text, str) and row.text.strip()
]
semantic_chunks = semantic_chunk(elements)

# Flatten each chunk of elements into one serializable record.
final_chunks = [
    {
        "chunk_id": idx,
        "pages": sorted({e["page"] for e in chunk}),
        "elements": [e["element"] for e in chunk],
        "text": " ".join(e["text"] for e in chunk),
    }
    for idx, chunk in enumerate(semantic_chunks)
]

out_df = pd.DataFrame(final_chunks)
# ensure_ascii=False keeps Korean text readable in the output JSON.
out_df.to_json(
    "semantic_chunks.json",
    orient="records",
    ensure_ascii=False,
    indent=2
)
print(f"생성 완료: {len(final_chunks)} semantic chunks")