- AstroScent

OCR JSON
↓
Flatten CSV
↓
Rule-based CSV (block_id, page_number, element_ids, merged_text)
↓
Token-safe chunk CSV ← 지금 만들 단계
↓
LLM 처리

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.*;

/*
###############################################
# Rule 결과 CSV → Token-safe Chunk CSV 생성
#
# 입력:  rule_result.csv
#        (block_id,page_number,element_ids,merged_text)
#
# 출력:  token_chunks.csv
#        (chunk_id,block_id,page_number,start_char,end_char,chunk_text)
#
# 목적:
# 1. Rule-based semantic 결과를 중간 백업 유지
# 2. LLM 투입 전 token-safe 분할
# 3. overlap 적용
# 4. 디버깅/재조립 가능하도록 start/end 보존
###############################################
*/

/**
 * Splits the blocks of the rule-based CSV into fixed-size, overlapping chunks
 * that are small enough to be sent to an LLM.
 *
 * <p>Input: {@code rule_result.csv} with the columns
 * {@code block_id,page_number,element_ids,merged_text}.
 *
 * <p>Output: {@code token_chunks.csv} with the columns
 * {@code chunk_id,block_id,page_number,start_char,end_char,chunk_text}.
 *
 * <p>Chunking is currently character based (a tokenizer can replace it later).
 * {@code start_char}/{@code end_char} are offsets into the block's merged text
 * so that every chunk can be traced back to, or reassembled into, its block.
 */
public class RuleToTokenChunk {

    private static final String DEFAULT_INPUT = "rule_result.csv";
    private static final String DEFAULT_OUTPUT = "token_chunks.csv";

    /** Default chunk length in characters (1200-1500 is the recommended range). */
    private static final int DEFAULT_CHUNK_SIZE = 1200;

    /** Default number of characters shared by consecutive chunks (100-150 recommended). */
    private static final int DEFAULT_OVERLAP = 120;

    /** One merged text block produced by the rule-based stage. */
    static class RuleBlock {
        int blockId;
        int pageNumber;
        String mergedText;

        public RuleBlock(int blockId, int pageNumber, String mergedText) {
            this.blockId = blockId;
            this.pageNumber = pageNumber;
            this.mergedText = mergedText;
        }
    }

    /** One chunk of a block, ready to be handed to the LLM stage. */
    static class TokenChunk {
        int chunkId;
        int blockId;
        int pageNumber;
        /** Inclusive start offset within the block's merged text. */
        int startChar;
        /** Exclusive end offset within the block's merged text. */
        int endChar;
        String text;

        public TokenChunk(int chunkId, int blockId, int pageNumber,
                          int startChar, int endChar, String text) {
            this.chunkId = chunkId;
            this.blockId = blockId;
            this.pageNumber = pageNumber;
            this.startChar = startChar;
            this.endChar = endChar;
            this.text = text;
        }
    }

    /**
     * Reads the rule-based CSV into memory.
     *
     * <p>The first line is treated as a header and skipped. Rows with fewer than
     * four columns (for example blank lines) are ignored. Records are read line
     * by line, so a quoted field must not contain a line break.
     *
     * @param path location of the rule-based CSV
     * @return the parsed blocks in file order
     * @throws IOException if the file cannot be read or a row has a non-numeric
     *                     {@code block_id} or {@code page_number}
     */
    private static List<RuleBlock> readRuleCsv(Path path) throws IOException {

        List<RuleBlock> blocks = new ArrayList<>();

        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {

            reader.readLine(); // skip the header row
            int lineNumber = 1;

            String line;
            while ((line = reader.readLine()) != null) {
                lineNumber++;

                // block_id,page_number,element_ids,merged_text
                List<String> fields = parseCsvLine(line);
                if (fields.size() < 4) {
                    continue;
                }

                try {
                    int blockId = Integer.parseInt(fields.get(0).trim());
                    int pageNumber = Integer.parseInt(fields.get(1).trim());

                    // Unquoted text may itself contain commas; everything from
                    // the fourth column onwards belongs to merged_text.
                    String mergedText =
                            String.join(",", fields.subList(3, fields.size())).trim();

                    blocks.add(new RuleBlock(blockId, pageNumber, mergedText));
                } catch (NumberFormatException e) {
                    throw new IOException(
                            "Invalid block_id/page_number at " + path + ":" + lineNumber, e);
                }
            }
        }

        return blocks;
    }

    /**
     * Splits one CSV record into its fields.
     *
     * <p>Commas inside double quotes do not separate fields, and a doubled quote
     * inside a quoted field yields a single quote character. The surrounding
     * quotes themselves are dropped.
     *
     * @param line one record without its line terminator
     * @return the decoded fields, always at least one element
     */
    private static List<String> parseCsvLine(String line) {
        List<String> fields = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        boolean inQuotes = false;

        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (inQuotes) {
                if (c != '"') {
                    current.append(c);
                } else if (i + 1 < line.length() && line.charAt(i + 1) == '"') {
                    current.append('"'); // escaped quote
                    i++;
                } else {
                    inQuotes = false;
                }
            } else if (c == '"') {
                inQuotes = true;
            } else if (c == ',') {
                fields.add(current.toString());
                current.setLength(0);
            } else {
                current.append(c);
            }
        }

        fields.add(current.toString());
        return fields;
    }

    /**
     * Cuts every block into chunks of at most {@code chunkSize} characters where
     * consecutive chunks of the same block share {@code overlap} characters.
     *
     * <p>Chunk ids are assigned sequentially across all blocks, starting at 0.
     * A block with empty text produces no chunk. A chunk boundary is moved by one
     * character when it would otherwise fall between the two halves of a
     * surrogate pair, so a chunk may be one character shorter or longer than
     * requested.
     *
     * @param blocks    blocks to split
     * @param chunkSize maximum chunk length in characters, must be positive
     * @param overlap   characters shared by neighbouring chunks, must satisfy
     *                  {@code 0 <= overlap < chunkSize}
     * @return all chunks in block order
     * @throws IllegalArgumentException if {@code chunkSize} or {@code overlap}
     *                                  is out of range
     */
    private static List<TokenChunk> chunkBlocks(
            List<RuleBlock> blocks,
            int chunkSize,
            int overlap
    ) {
        if (chunkSize <= 0) {
            throw new IllegalArgumentException("chunkSize must be positive: " + chunkSize);
        }
        if (overlap < 0 || overlap >= chunkSize) {
            throw new IllegalArgumentException(
                    "overlap must be in [0, chunkSize): overlap=" + overlap
                            + ", chunkSize=" + chunkSize);
        }

        List<TokenChunk> chunks = new ArrayList<>();
        int chunkCounter = 0;

        for (RuleBlock block : blocks) {

            String text = block.mergedText;
            int length = text.length();
            int start = 0;

            while (start < length) {

                int end = Math.min(start + chunkSize, length);
                if (splitsSurrogatePair(text, end)) {
                    // Shrink by one; grow instead when shrinking would leave nothing.
                    end = (end - 1 > start) ? end - 1 : end + 1;
                }

                chunks.add(new TokenChunk(
                        chunkCounter++,
                        block.blockId,
                        block.pageNumber,
                        start,
                        end,
                        text.substring(start, end)
                ));

                // The block is exhausted. Stepping back by the overlap here would
                // re-emit the tail of the text forever.
                if (end >= length) {
                    break;
                }

                int next = end - overlap;
                if (splitsSurrogatePair(text, next)) {
                    next--;
                }
                // Always move forward, giving up the overlap if necessary.
                start = (next > start) ? next : end;
            }
        }

        return chunks;
    }

    /** Tells whether {@code index} lies between the two halves of a surrogate pair. */
    private static boolean splitsSurrogatePair(String text, int index) {
        return index > 0
                && index < text.length()
                && Character.isHighSurrogate(text.charAt(index - 1))
                && Character.isLowSurrogate(text.charAt(index));
    }

    /**
     * Writes the chunks as CSV, replacing any existing file.
     *
     * <p>{@code start_char}/{@code end_char} are written for debugging. The chunk
     * text is always quoted, embedded quotes are doubled and line breaks are
     * replaced by spaces so that every record stays on one line.
     *
     * @param chunks     chunks to write
     * @param outputPath destination file
     * @throws IOException if the file cannot be written
     */
    private static void saveChunksToCsv(
            List<TokenChunk> chunks,
            Path outputPath
    ) throws IOException {

        try (BufferedWriter writer = Files.newBufferedWriter(
                outputPath,
                StandardCharsets.UTF_8,
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING)) {

            writer.write("chunk_id,block_id,page_number,start_char,end_char,chunk_text");
            writer.newLine();

            for (TokenChunk chunk : chunks) {

                String safeText = chunk.text
                        .replace("\"", "\"\"")
                        .replace("\n", " ")
                        .replace("\r", " ");

                writer.write(
                        chunk.chunkId + "," +
                        chunk.blockId + "," +
                        chunk.pageNumber + "," +
                        chunk.startChar + "," +
                        chunk.endChar + ",\"" +
                        safeText + "\""
                );

                writer.newLine();
            }
        }
    }

    /**
     * Runs the conversion.
     *
     * <p>All arguments are optional and positional:
     * {@code [inputCsv [outputCsv [chunkSize [overlap]]]]}. Without arguments the
     * program reads {@code rule_result.csv}, writes {@code token_chunks.csv} and
     * uses a chunk size of 1200 with an overlap of 120.
     *
     * @param args optional overrides as described above
     * @throws Exception if reading, chunking or writing fails
     */
    public static void main(String[] args) throws Exception {

        // Input: rule-based result CSV
        Path ruleCsv = Paths.get(args.length > 0 ? args[0] : DEFAULT_INPUT);

        // Output: token chunk CSV
        Path chunkCsv = Paths.get(args.length > 1 ? args[1] : DEFAULT_OUTPUT);

        int chunkSize = args.length > 2 ? Integer.parseInt(args[2]) : DEFAULT_CHUNK_SIZE;
        int overlap = args.length > 3 ? Integer.parseInt(args[3]) : DEFAULT_OVERLAP;

        // 1. Read the rule-based blocks
        List<RuleBlock> blocks = readRuleCsv(ruleCsv);

        // 2. Split them into chunks
        List<TokenChunk> chunks = chunkBlocks(blocks, chunkSize, overlap);

        // 3. Save the chunks
        saveChunksToCsv(chunks, chunkCsv);

        System.out.println("완료: 생성된 chunk 수 = " + chunks.size());
    }
}

import java.io.*;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.time.Duration;
import java.util.*;

/*
###########################################################
# Token Chunk CSV → LLM 호출 → 결과 CSV 저장
#
# 입력:
#   token_chunks.csv
#   (chunk_id,block_id,page_number,start_char,end_char,chunk_text)
#
# 출력:
#   llm_results.csv
#   (chunk_id,block_id,page_number,llm_output)
#
# 기능:
#   1. chunk_text를 LLM에 전송
#   2. 응답 저장
#   3. 중간 결과 백업 가능
###########################################################
*/

/**
 * Sends every chunk of the token chunk CSV to the chat-completions API and
 * stores the answers.
 *
 * <p>Input: {@code token_chunks.csv} with the columns
 * {@code chunk_id,block_id,page_number,start_char,end_char,chunk_text}.
 *
 * <p>Output: {@code llm_results.csv} with the columns
 * {@code chunk_id,block_id,page_number,llm_output}.
 */
public class ChunkToLLM {

    /** API key, taken from the {@code OPENAI_API_KEY} environment variable. */
    private static final String API_KEY = System.getenv("OPENAI_API_KEY");

    /** Model used for every request. */
    private static final String MODEL = "gpt-4o-mini";

    private static final URI ENDPOINT =
            URI.create("https://api.openai.com/v1/chat/completions");

    /** Instruction placed in front of every chunk (same text as the original prompt). */
    private static final String PROMPT_PREFIX = "다음 텍스트를 간결하게 요약하라.\n\n";

    /** Stored as the output of a chunk whose request did not yield an answer. */
    private static final String ERROR_MARKER = "ERROR";

    /** Pause between two requests, to avoid overloading the API. */
    private static final long REQUEST_PAUSE_MILLIS = 500;

    /** Creates the shared HTTP client on first use only. */
    private static final class ClientHolder {
        static final HttpClient CLIENT = HttpClient.newBuilder()
                .connectTimeout(Duration.ofSeconds(30))
                .build();
    }

    /** The part of a token chunk record that this stage needs. */
    static class TokenChunk {
        int chunkId;
        int blockId;
        int pageNumber;
        String text;

        public TokenChunk(int chunkId, int blockId, int pageNumber, String text) {
            this.chunkId = chunkId;
            this.blockId = blockId;
            this.pageNumber = pageNumber;
            this.text = text;
        }
    }

    /**
     * Reads the token chunk CSV.
     *
     * <p>The first line is treated as a header and skipped. Rows with fewer than
     * six columns are ignored. Records are read line by line, so a quoted field
     * must not contain a line break.
     *
     * @param path location of the token chunk CSV
     * @return the chunks in file order
     * @throws IOException if the file cannot be read or a row has a non-numeric id
     */
    private static List<TokenChunk> readChunkCsv(Path path) throws IOException {

        List<TokenChunk> chunks = new ArrayList<>();

        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {

            reader.readLine(); // skip the header row
            int lineNumber = 1;

            String line;
            while ((line = reader.readLine()) != null) {
                lineNumber++;

                List<String> fields = parseCsvLine(line);
                if (fields.size() < 6) {
                    continue;
                }

                try {
                    int chunkId = Integer.parseInt(fields.get(0).trim());
                    int blockId = Integer.parseInt(fields.get(1).trim());
                    int pageNumber = Integer.parseInt(fields.get(2).trim());

                    // Unquoted text may contain commas; everything from the
                    // sixth column onwards belongs to chunk_text.
                    String text = String.join(",", fields.subList(5, fields.size()));

                    chunks.add(new TokenChunk(chunkId, blockId, pageNumber, text));
                } catch (NumberFormatException e) {
                    throw new IOException(
                            "Invalid numeric column at " + path + ":" + lineNumber, e);
                }
            }
        }

        return chunks;
    }

    /**
     * Splits one CSV record into its fields.
     *
     * <p>Commas inside double quotes do not separate fields, and a doubled quote
     * inside a quoted field yields a single quote character. Package-private so
     * that it can be tested without network access.
     *
     * @param line one record without its line terminator
     * @return the decoded fields, always at least one element
     */
    static List<String> parseCsvLine(String line) {
        List<String> fields = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        boolean inQuotes = false;

        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (inQuotes) {
                if (c != '"') {
                    current.append(c);
                } else if (i + 1 < line.length() && line.charAt(i + 1) == '"') {
                    current.append('"'); // escaped quote
                    i++;
                } else {
                    inQuotes = false;
                }
            } else if (c == '"') {
                inQuotes = true;
            } else if (c == ',') {
                fields.add(current.toString());
                current.setLength(0);
            } else {
                current.append(c);
            }
        }

        fields.add(current.toString());
        return fields;
    }

    /**
     * Escapes text so that it can be placed between the quotes of a JSON string.
     *
     * <p>Quotes, backslashes and all control characters are escaped; without
     * this a line break in the prompt makes the whole request body invalid.
     * Package-private so that it can be tested without network access.
     *
     * @param text raw text
     * @return the escaped text, without surrounding quotes
     */
    static String escapeJson(String text) {
        StringBuilder out = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '"' -> out.append("\\\"");
                case '\\' -> out.append("\\\\");
                case '\n' -> out.append("\\n");
                case '\r' -> out.append("\\r");
                case '\t' -> out.append("\\t");
                case '\b' -> out.append("\\b");
                case '\f' -> out.append("\\f");
                default -> {
                    if (c < 0x20) {
                        out.append(String.format("\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
                }
            }
        }
        return out.toString();
    }

    /**
     * Extracts the first string-valued {@code "content"} member from a JSON
     * document and decodes its escape sequences.
     *
     * <p>This is a deliberately small scanner rather than a JSON parser: it
     * tolerates whitespace around the colon and skips {@code "content"} members
     * whose value is not a string (for example {@code null}). Package-private so
     * that it can be tested without network access.
     *
     * @param body response body, may be {@code null}
     * @return the decoded text, or {@code null} if no such member exists or the
     *         string is malformed
     */
    static String extractContent(String body) {
        if (body == null) {
            return null;
        }
        final String key = "\"content\"";
        int at = body.indexOf(key);
        while (at != -1) {
            int pos = skipWhitespace(body, at + key.length());
            if (pos < body.length() && body.charAt(pos) == ':') {
                pos = skipWhitespace(body, pos + 1);
                if (pos < body.length() && body.charAt(pos) == '"') {
                    return decodeJsonString(body, pos + 1);
                }
            }
            at = body.indexOf(key, at + 1);
        }
        return null;
    }

    /** Returns the index of the first non-whitespace character at or after {@code from}. */
    private static int skipWhitespace(String text, int from) {
        int pos = from;
        while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /**
     * Decodes a JSON string whose opening quote is immediately before {@code from}.
     *
     * @return the decoded text, or {@code null} if the string is unterminated or
     *         contains a malformed escape
     */
    private static String decodeJsonString(String json, int from) {
        StringBuilder out = new StringBuilder();
        int i = from;
        while (i < json.length()) {
            char c = json.charAt(i++);
            if (c == '"') {
                return out.toString();
            }
            if (c != '\\') {
                out.append(c);
                continue;
            }
            if (i >= json.length()) {
                return null;
            }
            char escape = json.charAt(i++);
            switch (escape) {
                case 'n' -> out.append('\n');
                case 'r' -> out.append('\r');
                case 't' -> out.append('\t');
                case 'b' -> out.append('\b');
                case 'f' -> out.append('\f');
                case 'u' -> {
                    if (i + 4 > json.length()) {
                        return null;
                    }
                    try {
                        out.append((char) Integer.parseInt(json.substring(i, i + 4), 16));
                    } catch (NumberFormatException e) {
                        return null;
                    }
                    i += 4;
                }
                // Quote, backslash and slash stand for themselves.
                default -> out.append(escape);
            }
        }
        return null;
    }

    /** Fails fast when the API key is not configured. */
    private static void requireApiKey() {
        if (API_KEY == null || API_KEY.isBlank()) {
            throw new IllegalStateException(
                    "Environment variable OPENAI_API_KEY is not set");
        }
    }

    /**
     * Asks the model to summarize one chunk.
     *
     * @param inputText chunk text to summarize
     * @return the model's answer, or {@code "ERROR"} when the API answered with a
     *         non-200 status or the answer could not be located in the response
     * @throws IllegalStateException if the API key is not configured
     * @throws Exception             if the request cannot be sent or is interrupted
     */
    private static String callLLM(String inputText) throws Exception {

        requireApiKey();

        // Build the prompt
        String prompt = PROMPT_PREFIX + inputText;

        String requestBody = """
                {
                  "model": "%s",
                  "messages": [
                    {"role": "system", "content": "You are a precise summarizer."},
                    {"role": "user", "content": "%s"}
                  ],
                  "max_tokens": 300
                }
                """.formatted(escapeJson(MODEL), escapeJson(prompt));

        HttpRequest request = HttpRequest.newBuilder()
                .uri(ENDPOINT)
                .timeout(Duration.ofSeconds(60))
                .header("Content-Type", "application/json")
                .header("Authorization", "Bearer " + API_KEY)
                .POST(HttpRequest.BodyPublishers.ofString(requestBody))
                .build();

        HttpResponse<String> response =
                ClientHolder.CLIENT.send(request, HttpResponse.BodyHandlers.ofString());

        if (response.statusCode() != 200) {
            // Keep the run going; the marker shows which chunk needs a retry.
            System.err.println("LLM request failed: HTTP " + response.statusCode());
            return ERROR_MARKER;
        }

        String content = extractContent(response.body());
        return content != null ? content : ERROR_MARKER;
    }

    /**
     * Writes one record per answered chunk, replacing any existing file.
     *
     * <p>Only the first {@code min(chunks.size(), outputs.size())} chunks are
     * written, so a partially processed run can be saved. The output text is
     * always quoted, embedded quotes are doubled and line breaks are replaced by
     * spaces so that every record stays on one line.
     *
     * @param chunks     processed chunks
     * @param outputs    answers, {@code outputs.get(i)} belonging to {@code chunks.get(i)}
     * @param outputPath destination file
     * @throws IOException if the file cannot be written
     */
    private static void saveResults(
            List<TokenChunk> chunks,
            List<String> outputs,
            Path outputPath
    ) throws IOException {

        try (BufferedWriter writer = Files.newBufferedWriter(
                outputPath,
                StandardCharsets.UTF_8,
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING)) {

            writer.write("chunk_id,block_id,page_number,llm_output");
            writer.newLine();

            int count = Math.min(chunks.size(), outputs.size());
            for (int i = 0; i < count; i++) {

                TokenChunk c = chunks.get(i);
                String output = outputs.get(i)
                        .replace("\"", "\"\"")
                        .replace("\n", " ")
                        .replace("\r", " ");

                writer.write(
                        c.chunkId + "," +
                        c.blockId + "," +
                        c.pageNumber + ",\"" +
                        output + "\""
                );

                writer.newLine();
            }
        }
    }

    /**
     * Runs the LLM stage.
     *
     * <p>Both arguments are optional and positional: {@code [inputCsv [outputCsv]]}.
     * Without arguments the program reads {@code token_chunks.csv} and writes
     * {@code llm_results.csv}. Answers collected so far are written even when a
     * request fails with an exception.
     *
     * @param args optional overrides as described above
     * @throws Exception if reading, a request or writing fails
     */
    public static void main(String[] args) throws Exception {

        Path chunkCsv = Paths.get(args.length > 0 ? args[0] : "token_chunks.csv");
        Path resultCsv = Paths.get(args.length > 1 ? args[1] : "llm_results.csv");

        // Checked before anything is read or overwritten.
        requireApiKey();

        List<TokenChunk> chunks = readChunkCsv(chunkCsv);

        List<String> outputs = new ArrayList<>(chunks.size());

        try {
            for (TokenChunk chunk : chunks) {

                System.out.println("LLM 처리 중: chunk " + chunk.chunkId);

                outputs.add(callLLM(chunk.text));

                // Avoid overloading the API
                Thread.sleep(REQUEST_PAUSE_MILLIS);
            }
        } finally {
            // Keep whatever was answered before a failure.
            saveResults(chunks, outputs, resultCsv);
        }

        System.out.println("완료: LLM 결과 저장 완료");
    }
}

OCR JSON
↓
Flatten CSV
↓
Rule-based CSV
↓
Token-safe Chunk CSV
↓
LLM 결과 CSV   ← 지금 코드

AstroScent

Leave a Reply Cancel reply

Comments

Archives

Categories