package naoki.openai;

import com.mongodb.client.MongoClients;
import com.mongodb.client.model.Filters;
import com.theokanning.openai.OpenAiHttpException;
import com.theokanning.openai.embedding.EmbeddingRequest;
import com.theokanning.openai.embedding.EmbeddingResult;
import com.theokanning.openai.service.OpenAiService;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Reads the blog export file named by {@link #HATENA_DATA}, requests an embedding for
 * each entry from the OpenAI embeddings endpoint, and stores the entry together with its
 * vector in the {@code entries} collection of the local MongoDB database {@code blog_db}.
 *
 * <p>Entries whose {@code baseName} is already present in the collection are skipped, so
 * the program can be re-run after an interruption.
 */
public class HatenaReader {

    /** Mutable accumulator for the header fields of the entry currently being parsed. */
    static class Header {
        String baseName;
        String image;
        String title;
        String date;
        boolean published;
    }

    /**
     * One blog entry as stored in MongoDB.
     *
     * @param title       value of the {@code TITLE} header
     * @param baseName    value of the {@code BASENAME} header; used as the duplicate-check key
     * @param image       value of the {@code IMAGE} header, or {@code null} if the entry has none
     * @param date        value of the {@code DATE} header, kept as the raw string
     * @param published   {@code true} if the header contained the line {@code STATUS: Publish}
     * @param body        the body lines exactly as read, each terminated by {@code \n}
     * @param stripedBody the body with markup tags removed, each line terminated by {@code \n}
     * @param vector      the embedding returned for (a prefix of) {@code stripedBody}
     */
    public record BlogEntry(
            String title,
            String baseName,
            String image,
            String date,
            boolean published,
            String body,
            String stripedBody,
            List<Double> vector) {
    }

    /** Returns the value of the {@code OPENAI_TOKEN} environment variable ({@code null} if unset). */
    static String getToken() {
        return System.getenv("OPENAI_TOKEN");
    }

    static final String HATENA_DATA = "nowokay.hatenablog.com.export.txt";

    /** Matches an opening or closing markup tag; compiled once instead of once per body line. */
    private static final Pattern TAG = Pattern.compile("<[a-z/][^>]*>");

    /** Upper bound, in UTF-16 chars, on the text sent to the embedding endpoint. */
    private static final int MAX_EMBEDDING_CHARS = 4000;

    /** Number of times an embedding request is attempted before giving up. */
    private static final int MAX_ATTEMPTS = 5;

    /** Parser state: which section of an entry the next line belongs to. */
    private enum Part { HEADER, CONTENT, BODY, COMMENT }

    /**
     * Parses the export file line by line and stores every entry not yet in the database.
     *
     * <p>Stops early (after printing a short message) when an entry lacks a base name, date
     * or title, or when the embedding request keeps failing.
     *
     * @param args ignored
     * @throws IOException          if the export file cannot be read
     * @throws InterruptedException if the thread is interrupted while waiting between requests
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        var path = Path.of(HATENA_DATA);
        try (var bur = Files.newBufferedReader(path);
             var client = MongoClients.create("mongodb://localhost:27017")) {
            var db = client.getDatabase("blog_db");
            var coll = db.getCollection("entries", BlogEntry.class);
            // NOTE(review): Duration.ZERO is presumably "no timeout" for the HTTP client - confirm.
            var service = new OpenAiService(getToken(), Duration.ZERO);

            Part part = Part.HEADER;
            int lineCount = 0;
            int docCount = 0;
            StringBuilder body = new StringBuilder();
            StringBuilder striped = new StringBuilder();
            Header h = new Header();

            for (String line; (line = bur.readLine()) != null; ) {
                switch (part) {
                    case HEADER -> {
                        if (readHeaderLine(h, line)) {
                            part = Part.CONTENT;
                        }
                    }
                    case CONTENT -> {
                        // The line following the header separator is discarded; it only
                        // triggers the reset of the per-entry buffers.
                        part = Part.BODY;
                        body.setLength(0);
                        striped.setLength(0);
                        lineCount = 0;
                        ++docCount;
                    }
                    case BODY -> {
                        if (line.equals("-----")) {
                            part = Part.COMMENT;
                        } else {
                            var s = stripTags(line);
                            striped.append(s).append("\n");
                            body.append(line).append("\n");
                            // Echo the first three stripped lines as a progress preview.
                            if (lineCount++ < 3) {
                                System.out.println(s);
                            }
                        }
                    }
                    case COMMENT -> {
                        if (h.baseName == null || h.date == null || h.title == null) {
                            System.out.println("!!");
                            return;
                        }
                        // Everything after the body is ignored until the entry terminator.
                        if (line.equals("--------")) {
                            if (coll.find(Filters.eq("baseName", h.baseName)).first() == null) {
                                var text = striped.toString();
                                var req = EmbeddingRequest.builder()
                                        .user("dummy")
                                        .model("text-embedding-ada-002")
                                        .input(List.of(embeddingInput(text)))
                                        .build();
                                EmbeddingResult res = requestEmbedding(service, req);
                                if (res == null) {
                                    System.out.println("retry 5 times but could not access");
                                    return;
                                }
                                BlogEntry ent = new BlogEntry(
                                        h.title, h.baseName, h.image, h.date, h.published,
                                        body.toString(), text,
                                        res.getData().get(0).getEmbedding());
                                coll.insertOne(ent);
                                System.out.println(ent.vector());
                                System.out.println("---");
                                Thread.sleep(Duration.ofSeconds(3).plusMillis(100)); // 20 request per min for the rate limit
                            }
                            part = Part.HEADER;
                            h = new Header();
                        }
                    }
                }
            }
            System.out.println(docCount);
        }
    }

    /**
     * Applies one header line to {@code header}, echoing recognised fields to stdout.
     *
     * @return {@code true} if the line is the {@code -----} separator that ends the header
     */
    private static boolean readHeaderLine(Header header, String line) {
        String value;
        if ((value = headerValue(line, "BASENAME")) != null) {
            header.baseName = value;
            System.out.println("bn:" + value);
        } else if ((value = headerValue(line, "IMAGE")) != null) {
            header.image = value;
            System.out.println("img:" + value);
        } else if ((value = headerValue(line, "TITLE")) != null) {
            header.title = value;
            System.out.println("title:" + value);
        } else if ((value = headerValue(line, "DATE")) != null) {
            header.date = value;
            System.out.println("date:" + value);
        } else if (line.equals("STATUS: Publish")) {
            header.published = true;
        } else if (line.equals("-----")) {
            return true;
        }
        return false;
    }

    /**
     * Extracts the value of a {@code KEY: value} header line.
     *
     * <p>The key must be followed directly by a colon; one optional space after the colon is
     * skipped. A line such as {@code "TITLE:"} therefore yields an empty string instead of
     * throwing, and a line that merely begins with the key text is not mistaken for it.
     *
     * @return the value, or {@code null} if {@code line} is not a header for {@code key}
     */
    private static String headerValue(String line, String key) {
        int colon = key.length();
        if (!line.startsWith(key) || line.length() <= colon || line.charAt(colon) != ':') {
            return null;
        }
        int start = colon + 1;
        if (start < line.length() && line.charAt(start) == ' ') {
            ++start;
        }
        return line.substring(start);
    }

    /** Removes every markup tag matched by {@link #TAG} from {@code line}. */
    private static String stripTags(String line) {
        return TAG.matcher(line).replaceAll("");
    }

    /**
     * Returns the prefix of {@code text} to embed: at most {@link #MAX_EMBEDDING_CHARS} chars,
     * shortened by one when the cut would otherwise fall between the two halves of a
     * surrogate pair.
     */
    private static String embeddingInput(String text) {
        if (text.length() <= MAX_EMBEDDING_CHARS) {
            return text;
        }
        int end = MAX_EMBEDDING_CHARS;
        if (Character.isHighSurrogate(text.charAt(end - 1))) {
            --end;
        }
        return text.substring(0, end);
    }

    /**
     * Sends {@code req}, retrying after a one-minute pause whenever the service throws an
     * {@link OpenAiHttpException}. No pause follows the final failed attempt.
     *
     * @return the result, or {@code null} if all {@link #MAX_ATTEMPTS} attempts failed
     * @throws InterruptedException if interrupted while waiting between attempts
     */
    private static EmbeddingResult requestEmbedding(OpenAiService service, EmbeddingRequest req)
            throws InterruptedException {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; ++attempt) {
            try {
                return service.createEmbeddings(req);
            } catch (OpenAiHttpException ex) {
                System.out.println(ex.getMessage());
                if (attempt < MAX_ATTEMPTS) {
                    Thread.sleep(Duration.ofMinutes(1));
                }
            }
        }
        return null;
    }
}