# xssdd

# ============================================================
# 0. 필수 설치 (최초 1회)
# ============================================================
# !maestro env conda install "palantir_models>=0.2129.0"
# !pip install pandas openpyxl

# ============================================================
# 1. 모델 import
# ============================================================
from language_model_service_api.languagemodelservice_api_completion_v3 import GptChatCompletionRequest
from language_model_service_api.languagemodelservice_api import ChatMessage, ChatMessageRole
from palantir_models.models import OpenAiGptChatLanguageModel

import pandas as pd

# ============================================================
# 2. Constants and pure helpers (no service calls, safe to import)
# ============================================================
# Test-time cap on how many characters of document text are sent to the model.
MAX_INPUT_CHARS = 20000

# Marker the system prompt asks the model to place between chunks.
CHUNK_DELIMITER = "<CHUNK>"

# ============================================================
# 3. Prompt for LLM-based semantic chunking
# ============================================================
system_prompt = """
You are an expert NLP document segmenter.

Your task:
- Read the given document.
- Compare meaning between sentences.
- Automatically group semantically similar sentences.
- Split into clean logical paragraphs.
- Remove broken lines and noise.
- Do NOT summarize.
- Do NOT change wording.
- Only reorganize into meaningful paragraphs.

Return format:
Each chunk separated by:
<CHUNK>
"""


def dataframe_to_text(df: "pd.DataFrame", max_chars: "int | None" = None) -> str:
    """Flatten every cell of *df* (row by row) into one newline-joined string.

    Missing cells become empty strings. The blanking is done with a mask
    computed on the original frame, because ``astype(str)`` alone turns a
    missing value into the literal text "nan", after which ``fillna`` has
    nothing left to fill.

    Args:
        df: The sheet to flatten. Column headers are not included, only
            cell values.
        max_chars: If given, the result is cut to at most this many
            characters; ``None`` keeps the full text.

    Returns:
        The joined text, optionally truncated.
    """
    cells = df.astype(str).mask(df.isna(), "")
    text = "\n".join(cells.to_numpy().ravel())
    return text if max_chars is None else text[:max_chars]


def split_chunks(result_text: "str | None") -> "list[str]":
    """Split a model reply on ``<CHUNK>`` and return the non-empty pieces.

    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped. A ``None`` reply is treated as empty text.
    """
    pieces = (result_text or "").split(CHUNK_DELIMITER)
    return [piece.strip() for piece in pieces if piece.strip()]


# Everything below talks to the model service and the file system. The guard
# keeps it running when the file is executed as a script or notebook cell
# (where __name__ is "__main__") while letting the helpers above be imported
# without triggering a model call. Names stay module-level on purpose so that
# later notebook cells can still read df, full_text, result_text and chunks.
if __name__ == "__main__":
    # ========================================================
    # 4. Load the GPT-4.1-mini model
    # ========================================================
    model = OpenAiGptChatLanguageModel.get("GPT_4_1_MINI")

    # ========================================================
    # 5. Load the Excel file
    # ========================================================
    file_path = "test.xlsx"   # <- change to your file name

    df = pd.read_excel(file_path, engine="openpyxl")

    # Convert every cell to text and merge into one document. The test-time
    # truncation is applied here, before the prompt is built, so it actually
    # limits what is sent to the model.
    full_text = dataframe_to_text(df, max_chars=MAX_INPUT_CHARS)

    user_prompt = f"\nDocument:\n\n{full_text}\n"

    # ========================================================
    # 6. Call GPT
    # ========================================================
    response = model.create_chat_completion(
        GptChatCompletionRequest(
            [
                ChatMessage(ChatMessageRole.SYSTEM, system_prompt),
                ChatMessage(ChatMessageRole.USER, user_prompt),
            ],
            temperature=0.0,
            max_tokens=4000
        )
    )

    result_text = response.choices[0].message.content

    # ========================================================
    # 7. Convert the reply into a list of chunks
    # ========================================================
    chunks = split_chunks(result_text)

    print(f"\n총 {len(chunks)}개 semantic chunks 생성\n")

    for number, chunk in enumerate(chunks, start=1):
        print(f"\n========== CHUNK {number} ==========\n")
        print(chunk)
# AstroScent
#
# Leave a Reply Cancel reply
#
# Comments
#
# Archives
#
# Categories