02/18/2026
OCR JSON
↓
Flatten CSV
↓
Rule-based CSV (block_id, page_number, element_ids, merged_text)
↓
Token-safe chunk CSV ← 지금 만들 단계
↓
LLM 처리
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.*;
/*
###############################################
# Rule 결과 CSV → Token-safe Chunk CSV 생성
#
# 입력: rule_result.csv
# (block_id,page_number,element_ids,merged_text)
#
# 출력: token_chunks.csv
# (chunk_id,block_id,page_number,start_char,end_char,chunk_text)
#
# 목적:
# 1. Rule-based semantic 결과를 중간 백업 유지
# 2. LLM 투입 전 token-safe 분할
# 3. overlap 적용
# 4. 디버깅/재조립 가능하도록 start/end 보존
###############################################
*/
public class RuleToTokenChunk {

    /**
     * One row of the rule-based semantic merge result
     * (block_id, page_number, merged_text).
     * The element_ids column from the input CSV is not needed downstream,
     * so it is dropped at read time.
     */
    static class RuleBlock {
        int blockId;
        int pageNumber;
        String mergedText;

        public RuleBlock(int blockId, int pageNumber, String mergedText) {
            this.blockId = blockId;
            this.pageNumber = pageNumber;
            this.mergedText = mergedText;
        }
    }

    /**
     * One LLM-ready chunk. startChar/endChar record the half-open span
     * [startChar, endChar) inside the source block's merged text so the
     * output can be debugged and reassembled later.
     */
    static class TokenChunk {
        int chunkId;
        int blockId;
        int pageNumber;
        int startChar;
        int endChar;
        String text;

        public TokenChunk(int chunkId, int blockId, int pageNumber,
                          int startChar, int endChar, String text) {
            this.chunkId = chunkId;
            this.blockId = blockId;
            this.pageNumber = pageNumber;
            this.startChar = startChar;
            this.endChar = endChar;
            this.text = text;
        }
    }

    /**
     * Step 1: read rule_result.csv (block_id,page_number,element_ids,merged_text)
     * into a list of RuleBlock.
     *
     * NOTE(review): simplified CSV handling — split with limit 4 keeps commas
     * inside merged_text, but stripping every double quote also destroys
     * legitimate "" escapes, and quoted fields containing newlines are not
     * supported. Acceptable for this pipeline's own output; revisit if the
     * input source changes.
     *
     * @throws IOException on read failure
     */
    static List<RuleBlock> readRuleCsv(Path path) throws IOException {
        List<RuleBlock> list = new ArrayList<>();
        try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            br.readLine(); // skip header row
            String line;
            while ((line = br.readLine()) != null) {
                // block_id,page_number,element_ids,merged_text
                String[] parts = line.split(",", 4);
                if (parts.length < 4) continue; // skip malformed rows
                int blockId = Integer.parseInt(parts[0].trim());
                int pageNumber = Integer.parseInt(parts[1].trim());
                // strip quoting
                String mergedText = parts[3]
                        .replace("\"", "")
                        .trim();
                list.add(new RuleBlock(blockId, pageNumber, mergedText));
            }
        }
        return list;
    }

    /**
     * Step 2: split each block's merged text into token-safe chunks with
     * overlap. Character-based for now; a real tokenizer can be swapped in
     * later.
     *
     * Bug fix vs. the original: after emitting the final chunk the loop
     * recomputed start = end - overlap, which never advances once end is
     * pinned at text.length(). Any block whose tail was shorter than the
     * overlap (including any text shorter than the overlap itself) looped
     * forever re-emitting the last chunk. We now break as soon as a chunk
     * reaches the end of the text, and reject overlap >= chunkSize, which
     * could also never advance.
     *
     * @param blocks    rule-based blocks to split
     * @param chunkSize max characters per chunk (1200-1500 recommended)
     * @param overlap   characters shared by adjacent chunks (100-150 recommended)
     * @return chunks in input order with globally increasing chunkId
     * @throws IllegalArgumentException if chunkSize <= 0 or overlap not in [0, chunkSize)
     */
    static List<TokenChunk> chunkBlocks(
            List<RuleBlock> blocks,
            int chunkSize,
            int overlap
    ) {
        if (chunkSize <= 0) {
            throw new IllegalArgumentException("chunkSize must be positive: " + chunkSize);
        }
        if (overlap < 0 || overlap >= chunkSize) {
            throw new IllegalArgumentException(
                    "overlap must be in [0, chunkSize): " + overlap);
        }
        List<TokenChunk> chunks = new ArrayList<>();
        int chunkCounter = 0;
        for (RuleBlock block : blocks) {
            String text = block.mergedText;
            int start = 0;
            while (start < text.length()) {
                int end = Math.min(start + chunkSize, text.length());
                chunks.add(new TokenChunk(
                        chunkCounter++,
                        block.blockId,
                        block.pageNumber,
                        start,
                        end,
                        text.substring(start, end)
                ));
                if (end == text.length()) break; // last chunk of this block — prevents infinite loop
                start = end - overlap;           // step back to create the overlap window
            }
        }
        return chunks;
    }

    /**
     * Step 3: write chunks to CSV, preserving start_char/end_char for
     * debugging and reassembly. Double quotes are doubled and newlines
     * flattened to spaces so every chunk stays on a single CSV line.
     *
     * @throws IOException on write failure
     */
    static void saveChunksToCsv(
            List<TokenChunk> chunks,
            Path outputPath
    ) throws IOException {
        try (BufferedWriter writer = Files.newBufferedWriter(
                outputPath,
                StandardCharsets.UTF_8,
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING)) {
            writer.write("chunk_id,block_id,page_number,start_char,end_char,chunk_text");
            writer.newLine();
            for (TokenChunk chunk : chunks) {
                String safeText = chunk.text
                        .replace("\"", "\"\"")
                        .replace("\n", " ")
                        .replace("\r", " ");
                writer.write(
                        chunk.chunkId + "," +
                        chunk.blockId + "," +
                        chunk.pageNumber + "," +
                        chunk.startChar + "," +
                        chunk.endChar + ",\"" +
                        safeText + "\""
                );
                writer.newLine();
            }
        }
    }

    /**
     * Entry point: rule_result.csv -> token_chunks.csv.
     * (The original used '#' line comments here, which is not valid Java —
     * replaced with '//'.)
     */
    public static void main(String[] args) throws Exception {
        // input: rule-based merge result
        Path ruleCsv = Paths.get("rule_result.csv");
        // output: token-safe chunks
        Path chunkCsv = Paths.get("token_chunks.csv");

        // 1. read rule result
        List<RuleBlock> blocks = readRuleCsv(ruleCsv);

        // 2. token-safe split (chunk size 1200 chars, overlap 120 chars)
        List<TokenChunk> chunks = chunkBlocks(blocks, 1200, 120);

        // 3. save CSV
        saveChunksToCsv(chunks, chunkCsv);
        System.out.println("완료: 생성된 chunk 수 = " + chunks.size());
    }
}
import java.io.*;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.time.Duration;
import java.util.*;
/*
###########################################################
# Token Chunk CSV → LLM 호출 → 결과 CSV 저장
#
# 입력:
# token_chunks.csv
# (chunk_id,block_id,page_number,start_char,end_char,chunk_text)
#
# 출력:
# llm_results.csv
# (chunk_id,block_id,page_number,llm_output)
#
# 기능:
# 1. chunk_text를 LLM에 전송
# 2. 응답 저장
# 3. 중간 결과 백업 가능
###########################################################
*/
public class ChunkToLLM {
# OpenAI API Key (환경변수로 설정 권장)
private static final String API_KEY = System.getenv("OPENAI_API_KEY");
# 사용할 모델
private static final String MODEL = "gpt-4o-mini";
/*
###########################################################
# TokenChunk 구조
###########################################################
*/
static class TokenChunk {
int chunkId;
int blockId;
int pageNumber;
String text;
public TokenChunk(int chunkId, int blockId, int pageNumber, String text) {
this.chunkId = chunkId;
this.blockId = blockId;
this.pageNumber = pageNumber;
this.text = text;
}
}
/*
###########################################################
# 1️⃣ token_chunks.csv 읽기
###########################################################
*/
private static List<TokenChunk> readChunkCsv(Path path) throws IOException {
List<TokenChunk> list = new ArrayList<>();
try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
br.readLine(); # header skip
String line;
while ((line = br.readLine()) != null) {
String[] parts = line.split(",", 6);
if (parts.length < 6) continue;
int chunkId = Integer.parseInt(parts[0].trim());
int blockId = Integer.parseInt(parts[1].trim());
int pageNumber = Integer.parseInt(parts[2].trim());
String text = parts[5].replace("\"", "");
list.add(new TokenChunk(chunkId, blockId, pageNumber, text));
}
}
return list;
}
/*
###########################################################
# 2️⃣ LLM 호출
###########################################################
*/
private static String callLLM(String inputText) throws Exception {
HttpClient client = HttpClient.newBuilder()
.connectTimeout(Duration.ofSeconds(30))
.build();
# 프롬프트 구성
String prompt = """
다음 텍스트를 간결하게 요약하라.
""" + inputText;
String requestBody = """
{
"model": "%s",
"messages": [
{"role": "system", "content": "You are a precise summarizer."},
{"role": "user", "content": "%s"}
],
"max_tokens": 300
}
""".formatted(MODEL, prompt.replace("\"", "\\\""));
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create("https://api.openai.com/v1/chat/completions"))
.timeout(Duration.ofSeconds(60))
.header("Content-Type", "application/json")
.header("Authorization", "Bearer " + API_KEY)
.POST(HttpRequest.BodyPublishers.ofString(requestBody))
.build();
HttpResponse<String> response =
client.send(request, HttpResponse.BodyHandlers.ofString());
String body = response.body();
# 응답 파싱 (간단 문자열 추출)
int start = body.indexOf("\"content\":\"");
if (start == -1) return "ERROR";
start += 11;
int end = body.indexOf("\"", start);
if (end == -1) return "ERROR";
return body.substring(start, end)
.replace("\\n", " ")
.replace("\\\"", "\"");
}
/*
###########################################################
# 3️⃣ 결과 CSV 저장
###########################################################
*/
private static void saveResults(
List<TokenChunk> chunks,
List<String> outputs,
Path outputPath
) throws IOException {
try (BufferedWriter writer = Files.newBufferedWriter(
outputPath,
StandardCharsets.UTF_8,
StandardOpenOption.CREATE,
StandardOpenOption.TRUNCATE_EXISTING)) {
writer.write("chunk_id,block_id,page_number,llm_output");
writer.newLine();
for (int i = 0; i < chunks.size(); i++) {
TokenChunk c = chunks.get(i);
String output = outputs.get(i)
.replace("\"", "\"\"");
writer.write(
c.chunkId + "," +
c.blockId + "," +
c.pageNumber + ",\"" +
output + "\""
);
writer.newLine();
}
}
}
/*
###########################################################
# 4️⃣ 실행 메인
###########################################################
*/
public static void main(String[] args) throws Exception {
Path chunkCsv = Paths.get("token_chunks.csv");
Path resultCsv = Paths.get("llm_results.csv");
List<TokenChunk> chunks = readChunkCsv(chunkCsv);
List<String> outputs = new ArrayList<>();
for (TokenChunk chunk : chunks) {
System.out.println("LLM 처리 중: chunk " + chunk.chunkId);
String result = callLLM(chunk.text);
outputs.add(result);
# API 과부하 방지
Thread.sleep(500);
}
saveResults(chunks, outputs, resultCsv);
System.out.println("완료: LLM 결과 저장 완료");
}
}
OCR JSON
↓
Flatten CSV
↓
Rule-based CSV
↓
Token-safe Chunk CSV
↓
LLM 결과 CSV ← 지금 코드