# xssdd
# 02/12/2026
# ============================================================
# 0. 필수 설치 (최초 1회)
# ============================================================
# !maestro env conda install "palantir_models>=0.2129.0"
# !pip install pandas openpyxl
# ============================================================
# 1. 모델 import
# ============================================================
from language_model_service_api.languagemodelservice_api_completion_v3 import GptChatCompletionRequest
from language_model_service_api.languagemodelservice_api import ChatMessage, ChatMessageRole
from palantir_models.models import OpenAiGptChatLanguageModel
import pandas as pd
# ============================================================
# 2. Load the GPT-4.1-mini model
# ============================================================
# Resolve the platform-hosted GPT-4.1-mini chat model by its identifier.
model = OpenAiGptChatLanguageModel.get("GPT_4_1_MINI")
# ============================================================
# 3. Load the Excel file
# ============================================================
file_path = "test.xlsx" # ← set your input file name
df = pd.read_excel(file_path, engine="openpyxl")
# Flatten every cell into a single newline-joined text blob.
# NOTE: fillna must run BEFORE astype(str) — the original order
# (astype(str).fillna("")) cast NaN cells to the literal string
# "nan" first, so fillna("") never matched anything and "nan"
# noise leaked into the prompt text.
full_text = "\n".join(df.fillna("").astype(str).values.flatten())
# ============================================================
# 4. Prompt for LLM-based semantic chunking
# ============================================================
# System prompt: instructs the model to regroup the document into
# semantically coherent paragraphs, each separated by the literal
# "<CHUNK>" marker that step 6 splits on. The wording is part of the
# runtime behavior — keep the string exactly as-is.
system_prompt = """
You are an expert NLP document segmenter.
Your task:
- Read the given document.
- Compare meaning between sentences.
- Automatically group semantically similar sentences.
- Split into clean logical paragraphs.
- Remove broken lines and noise.
- Do NOT summarize.
- Do NOT change wording.
- Only reorganize into meaningful paragraphs.
Return format:
Each chunk separated by:
<CHUNK>
"""
# Cap the document at 20,000 characters BEFORE it goes into the
# prompt, so the request stays within the model's context window.
# (The original script truncated full_text only on its very last
# line — after the request had already been sent — so the cap was
# dead code and never applied.)
user_prompt = f"""
Document:
{full_text[:20000]}
"""
# ============================================================
# 5. Call GPT
# ============================================================
# Assemble the chat transcript first, then wrap it in a completion
# request. temperature=0.0 keeps the segmentation deterministic.
chat_messages = [
    ChatMessage(ChatMessageRole.SYSTEM, system_prompt),
    ChatMessage(ChatMessageRole.USER, user_prompt),
]
chat_request = GptChatCompletionRequest(
    chat_messages,
    temperature=0.0,
    max_tokens=4000,
)
response = model.create_chat_completion(chat_request)
# Extract the text of the first (and only) completion choice.
result_text = response.choices[0].message.content
# ============================================================
# 6. Convert into a list of chunks
# ============================================================
# Split on the "<CHUNK>" delimiter requested in the system prompt,
# dropping empty / whitespace-only fragments.
chunks = [c.strip() for c in result_text.split("<CHUNK>") if c.strip()]
print(f"\n총 {len(chunks)}개 semantic chunks 생성\n")
# enumerate(..., start=1) gives 1-based chunk numbers directly.
for idx, chunk in enumerate(chunks, start=1):
    print(f"\n========== CHUNK {idx} ==========\n")
    print(chunk)
# NOTE: removed the original trailing "full_text = full_text[:20000]"
# line — it ran after every use of full_text, so it was dead code.