pg
02/10/2026
page_text = (
    df.groupby("page_number")["text"]
    .apply(lambda x: " ".join(x))
)
MAX_TOKENS = 450
OVERLAP = 40
chunks = []
current = ""
prev_tail = ""
for para in paragraphs:
    if token_len(current + para) > MAX_TOKENS:
        chunks.append(prev_tail + current)
        prev_tail = get_last_tokens(current, OVERLAP)
        current = para
    else:
        current += " " + para
if current:
    chunks.append(prev_tail + current)
pip install pandas tiktoken
import pandas as pd
import re
import tiktoken
# =====================
# Configuration
# =====================
CSV_PATH = "input.csv"
MODEL_NAME = "gpt-4o-mini"  # used only to pick the token-counting encoding
MAX_TOKENS = 450            # hard cap on tokens accumulated per chunk
OVERLAP_TOKENS = 40         # tail tokens carried from one chunk into the next

# Tokenizer matching MODEL_NAME (tiktoken raises KeyError for unknown models).
enc = tiktoken.encoding_for_model(MODEL_NAME)
# =====================
# Utility functions
# =====================
def token_len(text: str) -> int:
    """Return how many tokens *text* occupies under the configured ``enc``."""
    token_ids = enc.encode(text)
    return len(token_ids)
def get_last_tokens(text: str, n: int) -> str:
    """Return the text decoded from the last *n* tokens of *text*.

    Used to build the overlap prefix carried into the next chunk.

    Fix: returns "" for n <= 0.  The original `tokens[-n:]` with n == 0
    is `tokens[0:]`, i.e. the WHOLE text, which would silently duplicate
    entire chunks if the overlap were ever configured to 0.
    """
    if n <= 0:
        return ""
    tokens = enc.encode(text)
    # tokens[-n:] already yields the full list when len(tokens) <= n,
    # so the original `len(tokens) > n` branch was redundant.
    return enc.decode(tokens[-n:])
def clean_text(text: str) -> str:
    """Normalize one OCR text element; return "" for page-marker/date debris."""
    stripped = text.strip()
    # OCR artifacts from page furniture / table of contents.
    junk_patterns = (
        r"-\s*[ivxIVX]+\s*-",                # roman-numeral page marker, e.g. "- iv -"
        r"\d{4}\.\s*\d{1,2}\.\s*\d{1,2}\.",  # date stamp, e.g. "2024. 1. 2."
    )
    if any(re.fullmatch(pattern, stripped) for pattern in junk_patterns):
        return ""
    # Single characters are treated as OCR noise as well.
    return stripped if len(stripped) > 1 else ""
def split_paragraphs(text: str):
    """Split OCR page text into paragraphs.

    Breaks on: blank lines, whitespace after a colon, double spaces after
    a period, whitespace after Korean sentence endings ("...다."), and
    whitespace after numbered or Korean-lettered list markers.
    """
    separators = [
        r"\n\s*\n",
        r"(?<=:)\s+",
        r"(?<=\.)\s{2,}",
        r"(?<=다\.)\s+",
        r"(?<=\d\.)\s+",
        r"(?<=[가-하]\.)\s+",
    ]
    pieces = re.split("|".join(separators), text)
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
# =====================
# 1. Load CSV
# =====================
# Expected columns: page_number, element_id, text -- TODO confirm with producer.
df = pd.read_csv(CSV_PATH)
# =====================
# 2. Concatenate elements per page
# =====================
# Sort so elements join in reading order, clean each element, drop the
# ones clean_text() rejected (empty strings are falsy under filter(None, ...)).
page_texts = (
    df.sort_values(["page_number", "element_id"])
    .groupby("page_number")["text"]
    .apply(lambda x: " ".join(filter(None, map(clean_text, x))))
    .to_dict()
)
# =====================
# 3. Paragraph splitting
# =====================
# One record per paragraph, tagged with the page it came from.
paragraphs = [
    {"page": page, "text": para}
    for page, text in page_texts.items()
    for para in split_paragraphs(text)
]
# =====================
# 4. Chunking + Overlap
# =====================
chunks = []            # finished chunks: dicts with text / pages / tokens
current = ""           # text accumulated for the chunk being built
current_pages = set()  # page numbers contributing to `current`
prev_tail = ""         # last OVERLAP_TOKENS tokens of the previous chunk
for p in paragraphs:
    para_text = p["text"]
    para_tokens = token_len(para_text)
    if para_tokens > MAX_TOKENS:
        continue  # drop over-long paragraphs (usually broken OCR)
    if token_len(current + " " + para_text) > MAX_TOKENS:
        # Flush the current chunk, prefixed with the previous chunk's tail.
        # NOTE(review): the prepended tail can push the stored chunk past
        # MAX_TOKENS -- the cap is only enforced on `current` alone.
        chunks.append({
            "text": (prev_tail + " " + current).strip(),
            "pages": sorted(current_pages),
            "tokens": token_len(prev_tail + " " + current)
        })
        prev_tail = get_last_tokens(current, OVERLAP_TOKENS)
        current = para_text
        current_pages = {p["page"]}
    else:
        current += " " + para_text
        current_pages.add(p["page"])
# Flush the trailing partial chunk, if any.
if current:
    chunks.append({
        "text": (prev_tail + " " + current).strip(),
        "pages": sorted(current_pages),
        "tokens": token_len(prev_tail + " " + current)
    })
# =====================
# 5. Save results
# =====================
out_df = pd.DataFrame(chunks)
# ensure_ascii=False keeps Korean text readable in the JSON output.
out_df.to_json("chunks.json", orient="records", ensure_ascii=False, indent=2)
print(f"완료: {len(out_df)} chunks 생성")
You are given ONE text chunk.
DO NOT split, merge, or rewrite it.
Return ONLY a JSON object with:
- type (objective/method/result/background/etc)
- short_topic (5 words max)
- keywords (max 6)
Text:
"""
{{CHUNK_TEXT}}
"""
LLM 프롬프트 (초경량, 핵심)
You are given two consecutive text elements from a document.
Question:
Do these two elements belong to the SAME semantic unit?
Answer ONLY one of:
- YES
- NO
Element A:
"{{text_a}}"
Element B:
"{{text_b}}"
# Draft of the rule + LLM chunk-merging loop (sketch only: rule_says_yes,
# rule_says_no, llm_says_yes and elements are not defined in this snippet;
# the runnable version is semantic_chunk() further below).
chunks = []
current_chunk = [elements[0]]
for i in range(len(elements) - 1):
    a = elements[i]
    b = elements[i + 1]
    if rule_says_yes(a, b):
        current_chunk.append(b)
    elif rule_says_no(a, b):
        chunks.append(current_chunk)
        current_chunk = [b]
    else:
        # Rules are ambiguous -> defer the join/split decision to the LLM.
        if llm_says_yes(a, b):
            current_chunk.append(b)
        else:
            chunks.append(current_chunk)
            current_chunk = [b]
chunks.append(current_chunk)
import pandas as pd
import re
from typing import List
# ---------------------
# Configuration
# ---------------------
CSV_PATH = "input.csv"
# Whether to fall back to the LLM when the rules are ambiguous.
USE_LLM = True
# Start a new chunk whenever the page number changes.
# NOTE(review): rule_connect() breaks on page change unconditionally;
# this flag is never read by the code visible here.
BREAK_ON_PAGE_CHANGE = True
def is_heading(text: str) -> bool:
    """Heuristic: colon-terminated lines, or short lines without a final period."""
    if text.endswith(":"):
        return True
    return len(text) < 20 and not text.endswith(".")
def rule_connect(a: dict, b: dict) -> bool | None:
    """Rule-based linking decision for two consecutive elements.

    Returns:
        True  -- always join
        False -- always split
        None  -- ambiguous, defer to the LLM
    """
    text_a = a["text"]
    text_b = b["text"]
    # 1. Page boundary: always split.
    if a["page"] != b["page"]:
        return False
    # 2. Heading followed by its body: always join.
    if is_heading(text_a):
        return True
    # 3. Sentence cut mid-stream (no terminal period): always join.
    if not text_a.endswith((".", "다.", "니다.")):
        return True
    # 4. Numbered-list continuation, e.g. "1." / "(2)" prefix on B: join.
    if re.match(r"^\(?\d+[\.\)]", text_b):
        return True
    # Rules are inconclusive.
    return None
def call_llm_yes_no(text_a: str, text_b: str) -> bool:
    """Ask the LLM whether two consecutive elements form one semantic unit.

    The prompt constrains the model to answer YES or NO; anything other
    than "YES" (after strip/uppercase) is treated as NO.
    """
    prompt = f"""
You are given two consecutive text elements from a document.
Do they belong to the SAME semantic unit?
Answer ONLY:
YES or NO
Element A:
\"\"\"{text_a}\"\"\"
Element B:
\"\"\"{text_b}\"\"\"
""".strip()
    # -------------------------
    # 🔥 Plug your LLM call in here
    # -------------------------
    # NOTE(review): YOUR_LLM_CALL is an undefined placeholder -- calling
    # this function raises NameError until it is replaced with a real client.
    response = YOUR_LLM_CALL(prompt)
    return response.strip().upper() == "YES"
def semantic_chunk(elements: List[dict]) -> List[List[dict]]:
    """Group consecutive elements into semantic chunks.

    Each element is a dict with at least "page" and "text" keys.
    rule_connect() decides first; ambiguous pairs go to the LLM when
    USE_LLM is set, otherwise they split.

    Fix: returns [] for empty input (the original crashed with
    IndexError on elements[0]).

    Returns a list of chunks, each a list of the original element dicts.
    """
    if not elements:
        return []
    chunks: List[List[dict]] = []
    current = [elements[0]]
    # Walk consecutive (a, b) pairs instead of indexing by position.
    for a, b in zip(elements, elements[1:]):
        decision = rule_connect(a, b)
        if decision is None and USE_LLM:
            decision = call_llm_yes_no(a["text"], b["text"])
        if decision:  # True -> join; False or unresolved None -> split
            current.append(b)
        else:
            chunks.append(current)
            current = [b]
    chunks.append(current)
    return chunks
# Load the CSV and normalize rows into ordered element dicts.
df = pd.read_csv(CSV_PATH)
# Keep only rows whose text is a real non-blank string (drops NaN and
# whitespace-only cells), sorted into reading order by page then element id.
elements = [
    {
        "page": row.page_number,
        "element": row.element_id,
        "text": str(row.text).strip()
    }
    for _, row in df.sort_values(
        ["page_number", "element_id"]
    ).iterrows()
    if isinstance(row.text, str) and row.text.strip()
]
semantic_chunks = semantic_chunk(elements)

# Flatten each chunk of elements into one serializable record.
final_chunks = [
    {
        "chunk_id": idx,
        "pages": sorted({e["page"] for e in chunk}),
        "elements": [e["element"] for e in chunk],
        "text": " ".join(e["text"] for e in chunk),
    }
    for idx, chunk in enumerate(semantic_chunks)
]

out_df = pd.DataFrame(final_chunks)
# ensure_ascii=False keeps Korean text readable in the output JSON.
out_df.to_json(
    "semantic_chunks.json",
    orient="records",
    ensure_ascii=False,
    indent=2
)
print(f"생성 완료: {len(final_chunks)} semantic chunks")