66 - AstroScent

"""
Hybrid Rule + LLM Semantic Chunking
-----------------------------------
CSV 구조:
page_number, element_id, text

전략:
1. element 단위를 최소 원자 단위로 사용
2. 앞 element와 뒤 element가 의미적으로 이어지는지 판단
3. 규칙으로 1차 판별
4. 애매할 때만 LLM에게 YES/NO 질문
5. semantic chunk 생성

주의:
- LLM은 반드시 YES/NO만 반환하도록 구성
- 한 번에 두 element만 비교
"""

import pandas as pd
import re
from typing import List

# =========================
# 설정
# =========================

# Input CSV; main() reads the page_number, element_id and text columns from it.
CSV_PATH = "input.csv"
# Destination file for the generated chunks (written as JSON records by main()).
OUTPUT_PATH = "semantic_chunks.json"

USE_LLM = True                 # when False, ambiguous pairs are split without asking the LLM
BREAK_ON_PAGE_CHANGE = True    # when True, a page change always starts a new chunk


# =========================
# LLM 호출 함수 (수정 필요)
# =========================

def YOUR_LLM_CALL(prompt: str) -> str:
    """
    Placeholder for the real LLM client call.

    Replace the body with code that sends ``prompt`` to your model and
    returns its reply. The reply should consist of only "YES" or "NO".

    Raises:
        NotImplementedError: always, until the body is replaced.
    """
    message = "LLM 호출 코드를 여기에 넣으세요."
    raise NotImplementedError(message)


def call_llm_yes_no(text_a: str, text_b: str) -> bool:
    """
    Ask the LLM whether two consecutive elements form one semantic unit.

    Args:
        text_a: Text of the earlier element.
        text_b: Text of the element that follows it.

    Returns:
        True when the first alphabetic word of the reply is YES
        (case-insensitive; surrounding quotes, markdown and punctuation are
        ignored). False for anything else, including NO, an empty reply or
        a non-string reply.
    """
    prompt = f"""
You are given two consecutive text elements from a document.

Do they belong to the SAME semantic unit?

Answer ONLY:
YES or NO

Element A:
\"\"\"{text_a}\"\"\"

Element B:
\"\"\"{text_b}\"\"\"
""".strip()

    response = YOUR_LLM_CALL(prompt)
    if not isinstance(response, str):
        # No usable verdict: fall back to the conservative answer (split).
        return False

    # Models often decorate the verdict ("Yes.", "**YES**", "YES - same topic"),
    # so compare only the first alphabetic word instead of the whole reply.
    first_word = re.search(r"[A-Za-z]+", response)
    return first_word is not None and first_word.group().upper() == "YES"


# =========================
# 규칙 기반 1차 판단
# =========================

def is_heading(text: str) -> bool:
    """
    Heuristically decide whether ``text`` looks like a heading.

    A text counts as a heading when it ends with a colon, or when it is
    shorter than 25 characters and does not end with a period.
    """
    if text.endswith(":"):
        return True
    if text.endswith("."):
        return False
    return len(text) < 25


def rule_connect(a: dict, b: dict) -> bool | None:
    """
    Apply the rule-based first pass to a pair of consecutive elements.

    Args:
        a: The earlier element; its "text" (and, when page breaking is
            enabled, "page") entries are read.
        b: The following element; same keys as ``a``.

    Returns:
        True  -> the elements are definitely connected
        False -> the elements are definitely separate
        None  -> no rule fired; the caller should consult the LLM
    """
    prev_text = a["text"]
    next_text = b["text"]

    # Rule 1: a page change splits by default (only checked when enabled).
    if BREAK_ON_PAGE_CHANGE:
        if a["page"] != b["page"]:
            return False

    # Any one of the remaining rules is enough to connect the pair:
    #   2. the earlier element is a heading (heading + description)
    #   3. the earlier element stops mid-sentence (no sentence-final period)
    #   4. the following element starts like a numbered-list item
    #   5. the earlier element is a very short line (possible OCR fragment)
    should_connect = (
        is_heading(prev_text)
        or not prev_text.endswith((".", "다.", "니다."))
        or re.match(r"^\(?\d+[\.\)]", next_text) is not None
        or len(prev_text) < 10
    )

    # Ambiguous when nothing matched.
    return True if should_connect else None


# =========================
# Semantic Chunking 본체
# =========================

def semantic_chunk(elements: List[dict]) -> List[List[dict]]:
    """
    Group an ordered list of elements into semantic chunks.

    Each adjacent pair is first judged by ``rule_connect``. When the rules
    are inconclusive, the LLM is asked (if ``USE_LLM`` is enabled);
    otherwise the pair is split.

    Args:
        elements: Elements in reading order; each has a "text" entry.

    Returns:
        A list of chunks, each chunk being a list of the original element
        dicts. Empty input yields an empty list.
    """
    if not elements:
        return []

    # The last group is always the chunk currently being built.
    groups: List[List[dict]] = [[elements[0]]]

    for prev, nxt in zip(elements, elements[1:]):
        verdict = rule_connect(prev, nxt)

        if verdict is True:
            same_unit = True
        elif verdict is False:
            same_unit = False
        else:
            # Rules were inconclusive: defer to the LLM when it is enabled.
            same_unit = bool(
                USE_LLM and call_llm_yes_no(prev["text"], nxt["text"])
            )

        if same_unit:
            groups[-1].append(nxt)
        else:
            groups.append([nxt])

    return groups


# =========================
# 실행부
# =========================

def main():
    """
    Load elements from CSV_PATH, chunk them, and write the result to OUTPUT_PATH.

    The CSV is sorted by (page_number, element_id). Rows whose text is not a
    string or is blank are skipped. Each resulting chunk is stored as a
    record with its id, the sorted distinct pages it covers, the ids of its
    elements, and the element texts joined by single spaces.
    """
    # Load the CSV and put the rows in reading order.
    df = pd.read_csv(CSV_PATH)
    df = df.sort_values(["page_number", "element_id"])

    # Build the element list, dropping rows with missing or blank text.
    elements = [
        {
            "page": row.page_number,
            "element": row.element_id,
            "text": str(row.text).strip()
        }
        for _, row in df.iterrows()
        if isinstance(row.text, str) and row.text.strip()
    ]

    semantic_chunks = semantic_chunk(elements)

    # Convert to a retrieval-friendly record structure.
    final_chunks = [
        {
            "chunk_id": idx,
            "pages": sorted(set(e["page"] for e in chunk)),
            "elements": [e["element"] for e in chunk],
            "text": " ".join(e["text"] for e in chunk)
        }
        for idx, chunk in enumerate(semantic_chunks)
    ]

    # pandas names this option force_ascii (json.dumps calls it ensure_ascii);
    # passing ensure_ascii to DataFrame.to_json raises TypeError. Disabling it
    # keeps non-ASCII text (e.g. Korean) readable in the output file.
    pd.DataFrame(final_chunks).to_json(
        OUTPUT_PATH,
        orient="records",
        force_ascii=False,
        indent=2
    )

    print(f"완료: {len(final_chunks)} semantic chunks 생성")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

ㄴㄴㄴㄴㄴ

"""
Palantir Foundry AIP LLM 사용 예제
---------------------------------
기능:
1. 사용 가능한 모델 목록 조회
2. 특정 모델 선택
3. YES/NO 추론 테스트
4. semantic element 비교 함수

Foundry Code Workbook / Jupyter 환경에서 실행
"""

# ============================================================
# 1️⃣ 모델 목록 조회
# ============================================================

from palantir_models import Model, list_models

# NOTE(review): `list_models` comes from the `palantir_models` import above;
# confirm that the installed package version actually exposes it.
print("=== 사용 가능한 모델 목록 ===")

models = list_models()

# Print every entry returned by list_models(), one per line.
for m in models:
    print(m)

# ============================================================
# 2️⃣ 모델 선택
# ============================================================

# Set this to one of the model identifiers printed by the listing step above.
MODEL_PATH = "ri.models.main.oss-low-model"

# Handle used by every inference call further down in this script.
model = Model(MODEL_PATH)

print(f"\n선택된 모델: {MODEL_PATH}")


# ============================================================
# 3️⃣ 기본 추론 테스트
# ============================================================

# Smoke test: a trivial YES/NO question to check that the model responds.
test_prompt = """
You are a classifier.

Answer ONLY:
YES or NO

Question:
Is the sky blue?
"""

# NOTE(review): the keyword names accepted by `model.generate` are not
# visible in this file; confirm them against the palantir_models API.
response = model.generate(
    prompt=test_prompt,
    max_tokens=5,      # keep this small; only a YES/NO answer is expected
    temperature=0,     # deterministic
)

print("\n=== 기본 추론 테스트 결과 ===")
print(response)


# ============================================================
# 4️⃣ Semantic 연결 판단 함수
# ============================================================

def llm_yes_no(text_a, text_b):
    """
    Ask the module-level ``model`` whether two elements share a semantic unit.

    The prompt instructs the model to answer only YES or NO. The reply is
    returned stripped and upper-cased, without any further validation.
    """
    prompt_lines = [
        "You are given two consecutive text elements from a document.",
        "",
        "Do they belong to the SAME semantic unit?",
        "",
        "Answer ONLY:",
        "YES or NO",
        "",
        "Element A:",
        f'"""{text_a}"""',
        "",
        "Element B:",
        f'"""{text_b}"""',
    ]

    raw_reply = model.generate(
        prompt="\n".join(prompt_lines),
        max_tokens=5,
        temperature=0,
        stop=["\n"],  # cut the reply at the first newline to avoid extra output
    )

    normalized = raw_reply.strip()
    return normalized.upper()


# ============================================================
# 5️⃣ 연결 판단 테스트
# ============================================================

# Sample pair: a title-like line followed by a sentence.
a = "촉매를 이용한 VOC 제거 기술"
b = "본 기술은 산업 현장에서 발생하는 악취를 처리한다."

# The normalized model reply for the pair (upper-cased text, not a bool).
result = llm_yes_no(a, b)

print("\n=== Semantic 연결 판단 결과 ===")
print(result)


# ============================================================
# 6️⃣ (선택) element 리스트에 적용 예제
# ============================================================

# Minimal element list; only the "text" key is needed for this demo.
elements = [
    {"text": "촉매를 이용한 VOC 제거 기술"},
    {"text": "본 기술은 산업 현장에서 발생하는 악취를 처리한다."},
    {"text": "실험 결과 제거율은 95% 이상이었다."}
]

print("\n=== element 간 연결 테스트 ===")

# Compare each element with its immediate successor and print the reply.
for i in range(len(elements) - 1):
    decision = llm_yes_no(elements[i]["text"], elements[i+1]["text"])
    print(f"{i} -> {i+1} : {decision}")

from palantir_models import Model

# The model path has to be known in advance; nothing is listed in this snippet.
model = Model("ri.models.main.oss-low-model")
print("모델 로드 완료")

AstroScent

66

Leave a Reply Cancel reply

Comments

Archives

Categories