import java.io.*;
import java.nio.file.*;
import java.util.*;
import java.util.regex.*;
import java.util.stream.*;

/**
 * Command-line utility that cleans a text/CSV file line by line.
 *
 * <p>Each line is passed through {@link #normalizeLine(String)}; lines that
 * are blank after normalization are dropped, and the remaining lines are
 * written to the output file.
 *
 * <p>Usage: {@code java CsvCleaner [input [output]]}. With no arguments the
 * input is {@code EST_LLM_1.csv}. When no output is given it is placed next to
 * the input with {@code _cleaned} inserted before the file extension
 * (so the default output is {@code EST_LLM_1_cleaned.csv}).
 */
public class CsvCleaner {

    /** Input file used when no command-line argument is supplied. */
    private static final String DEFAULT_INPUT = "EST_LLM_1.csv";

    /** Suffix inserted before the extension to derive the default output name. */
    private static final String CLEANED_SUFFIX = "_cleaned";

    /** Matches template placeholders of the form <code>${...}</code>. */
    private static final Pattern TEMPLATE_PATTERN = Pattern.compile("\\$\\{[^}]*}");

    /** Matches roman-numeral section markers such as {@code - ii -}. */
    private static final Pattern SECTION_PATTERN = Pattern.compile("-\\s*[ivxIVX]+\\s*-");

    /** Matches runs of two or more spaces/tabs. */
    private static final Pattern MULTI_SPACE = Pattern.compile("[ \\t]{2,}");

    /**
     * Reads the input file, normalizes every line, drops blank results and
     * writes the remainder to the output file.
     *
     * @param args optional {@code [input [output]]} paths; may be empty or {@code null}
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {

        int argCount = (args == null) ? 0 : args.length;

        Path input = Paths.get(argCount > 0 ? args[0] : DEFAULT_INPUT);
        Path output = (argCount > 1) ? Paths.get(args[1]) : defaultOutputFor(input);

        // Files.readAllLines(Path) and Files.write(Path, Iterable) both use UTF-8.
        List<String> cleaned = Files.readAllLines(input).stream()
                .map(CsvCleaner::normalizeLine)
                .filter(s -> !s.isBlank())
                .collect(Collectors.toList());

        Files.write(output, cleaned);

        System.out.println("정리 완료 → " + output.toAbsolutePath());
    }

    /**
     * Derives the default output path: a sibling of {@code input} whose file
     * name has {@code _cleaned} inserted before the extension (or appended
     * when the name has no extension).
     *
     * @param input the input path
     * @return the derived output path
     * @throws IllegalArgumentException if {@code input} has no file name element
     */
    private static Path defaultOutputFor(Path input) {
        Path fileName = input.getFileName();
        if (fileName == null) {
            throw new IllegalArgumentException("Input path has no file name: " + input);
        }

        String name = fileName.toString();
        int dot = name.lastIndexOf('.');
        // dot > 0 so that a leading dot (hidden file) is not treated as an extension.
        String cleanedName = (dot > 0)
                ? name.substring(0, dot) + CLEANED_SUFFIX + name.substring(dot)
                : name + CLEANED_SUFFIX;

        return input.resolveSibling(cleanedName);
    }

    /**
     * Normalizes a single line of text.
     *
     * @param line the raw line; {@code null} is treated as empty
     * @return the cleaned, trimmed line (possibly empty)
     */
    private static String normalizeLine(String line) {

        if (line == null) {
            return "";
        }

        // 1. Remove ${...} template placeholders.
        line = TEMPLATE_PATTERN.matcher(line).replaceAll("");

        // 2. Remove section markers such as "- ii -".
        line = SECTION_PATTERN.matcher(line).replaceAll("");

        // 3. Remove any leftover curly braces / square brackets.
        line = line.replace("{", "")
                   .replace("}", "")
                   .replace("[", "")
                   .replace("]", "");

        // 4. Collapse runs of spaces/tabs into a single space.
        line = MULTI_SPACE.matcher(line).replaceAll(" ");

        return line.trim();
    }
}
/**
 * Joins consecutive lines into paragraphs.
 *
 * <p>Lines are accumulated, separated by single spaces, until a line ending
 * in {@code "."} is seen; that line closes the paragraph. Any text left over
 * after the last line is emitted as a final paragraph. Blank and
 * {@code null} lines are skipped so they neither introduce doubled spaces
 * inside a paragraph nor produce empty entries in the result.
 *
 * @param lines the lines to merge, in order
 * @return the merged paragraphs, each trimmed
 */
private static List<String> mergeParagraphs(List<String> lines) {

    List<String> result = new ArrayList<>();
    StringBuilder paragraph = new StringBuilder();

    for (String line : lines) {

        // Blank lines carry no text; appending them would only add extra separators.
        if (line == null || line.isBlank()) {
            continue;
        }

        paragraph.append(line);

        // A trailing period marks the end of a sentence (this also covers
        // Korean endings such as "다." and "있다.").
        if (line.endsWith(".")) {
            result.add(paragraph.toString().trim());
            paragraph.setLength(0);
        } else {
            paragraph.append(' ');
        }
    }

    // Flush a trailing paragraph that never reached a sentence end.
    if (paragraph.length() > 0) {
        result.add(paragraph.toString().trim());
    }

    return result;
}
// Normalize every line first, then join the normalized lines into paragraphs.
List<String> normalized = lines.stream()
        .map(CsvCleaner::normalizeLine)
        .collect(Collectors.toList());
List<String> cleaned = mergeParagraphs(normalized);
line = line.replaceAll("\"", "");
line = line.replaceAll("\\\\", "");

Leave a Reply

Your email address will not be published. Required fields are marked *