Last active
September 13, 2024 16:58
-
-
Save thanhdatpd/3b6fc1abead3774ecad24a93966bb9dc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import fs from "fs"; | |
| import _ from "lodash"; | |
| import { mkConfig, generateCsv, asString } from "export-to-csv"; | |
| import { writeFile } from "node:fs"; | |
| import { Buffer } from "node:buffer"; | |
// CSV export settings shared by the whole script. mkConfig merges the given
// options with the library defaults and returns WithDefaults<ConfigOptions>;
// useKeysAsHeaders asks the library to take the header row from the row keys.
const csvExportOptions = { useKeysAsHeaders: true };
const csvConfig = mkConfig(csvExportOptions);
// One-off preprocessing step, kept for reference: it converts ./pdf.pdf into
// the ./F1040EZ.json file that main() reads.
// import PDFParser from "pdf2json";
// const pdfParser = new PDFParser(this, 1);
// pdfParser.on("pdfParser_dataError", (errData) =>
//   console.error(errData.parserError)
// );
// pdfParser.on("pdfParser_dataReady", (pdfData) => {
//   fs.writeFile(
//     "./F1040EZ.json",
//     JSON.stringify(pdfData),
//     (data) => console.log(data)
//   );
// });
// pdfParser.loadPDF("./pdf.pdf");
// Start every run with a clean terminal.
console.clear();

// Date strings accepted in the date column: "01/09/2024" through "10/09/2024".
const arrDate = Array.from(
  { length: 10 },
  (unused, offset) => `${String(offset + 1).padStart(2, "0")}/09/2024`
);

// x coordinates observed in the parsed statement and how main() classifies
// the text boxes found there:
//   1.55              -> date (when the text is in arrDate) or time
//   9.95              -> debit
//   14.525 .. 16.275  -> credit (observed values: 14.525, 14.9, 15.15, 15.4,
//                        15.774999999999999, 16.025, 16.275)
//   20.125            -> balance
//   24.125            -> detail

/**
 * Tells whether a text is exactly one of the known statement dates.
 * @param {*} value - candidate text; no trimming or normalisation is applied.
 * @returns {boolean} true only when value equals an entry of arrDate.
 */
const isValidDate = (value) => arrDate.some((knownDate) => knownDate === value);
/**
 * Formats a numeric-like value with exactly two decimal places.
 * @param {*} value - anything Number() can coerce.
 * @returns {string} fixed-point text such as "1.50" ("NaN" when not numeric).
 */
const f2 = (value) => {
  const numeric = Number(value);
  return numeric.toFixed(2);
};
/**
 * Rounds a numeric-like value to two decimal places and returns a number.
 * @param {*} value - anything Number() can coerce.
 * @returns {number} the value after a round trip through toFixed(2).
 */
const nf2 = (value) => {
  const twoDecimalText = Number(value).toFixed(2);
  return Number(twoDecimalText);
};
/**
 * Tells whether two numeric-like values are equal within a small tolerance.
 * @param {*} value - first operand, coerced with Number().
 * @param {*} value2 - second operand, coerced with Number().
 * @returns {boolean} true when the absolute difference is at most 0.011.
 */
const compare_rounded = (value, value2) => {
  // Slightly above 0.01, so values one hundredth apart still count as equal.
  const tolerance = 0.011;
  const difference = Number(value) - Number(value2);
  return Math.abs(difference) <= tolerance;
};
// Debug-only collectors for coordinates of unclassified text boxes; nothing
// but commented-out diagnostics refers to them.
const setX = new Set();
const setY = new Set();

// Page count printed in the statement footer ("... of 12028"). It is also the
// exclusive upper bound of the page index loop in main().
const STATEMENT_PAGE_COUNT = 12028;
// Footer text that ends the useful part of a page.
const PAGE_FOOTER_MARKER = ` of ${STATEMENT_PAGE_COUNT}`;
// Text boxes with a smaller y are skipped as the top-of-page area.
const PAGE_BODY_MIN_Y = 3.891;
// Two consecutive detail boxes whose rounded y values differ by at most this
// much are treated as lines of the same transaction description.
const DETAIL_MAX_LINE_GAP = 0.77;
// Vertical offsets, relative to a date box, at which the amount (credit or
// debit) and the time of the same transaction are looked up.
const AMOUNT_ROW_OFFSET_Y = 0.356;
const TIME_ROW_OFFSET_Y = 0.675;

// Maps the x coordinate of a text box (and its raw text) to a column type,
// or null when the box belongs to no known column.
const classifyTextBox = (x, text) => {
  if (x === 20.125) return "balance";
  if (x === 24.125) return "detail";
  if (x === 1.55) return isValidDate(text) ? "date" : "time";
  if (x === 9.95) return "debit";
  if (x >= 14.525 && x <= 16.275) return "credit";
  return null;
};

// Decodes the text boxes of one page into { x, y, value, type } records,
// skipping the top-of-page area and stopping at the footer marker.
const readPageTextBoxes = (pageTexts) => {
  const boxes = [];
  for (const textBox of pageTexts) {
    const text = _.map(textBox.R, ({ T }) => decodeURIComponent(T)).join(" ");
    const { x, y } = textBox;
    if (y < PAGE_BODY_MIN_Y) continue;
    if (text.includes(PAGE_FOOTER_MARKER)) break;
    boxes.push({ x, y, value: _.trim(text), type: classifyTextBox(x, text) });
  }
  return boxes;
};

// Merges vertically adjacent detail boxes into one description per
// transaction; first_y is the y of the first line of each description.
const groupDetailLines = (detailBoxes) => {
  const groups = [];
  let currentLines = [];
  const flush = () => {
    if (currentLines.length === 0) return;
    groups.push({
      value: currentLines.map((line) => line.value).join(" "),
      first_y: currentLines[0].y,
    });
  };
  for (const box of detailBoxes) {
    const previous = currentLines[currentLines.length - 1];
    const continuesPrevious =
      previous !== undefined &&
      Math.abs(nf2(previous.y) - nf2(box.y)) <= DETAIL_MAX_LINE_GAP;
    if (previous === undefined || continuesPrevious) {
      currentLines.push(box);
    } else {
      flush();
      currentLines = [box];
    }
  }
  flush();
  return groups;
};

// Returns the amount text of a box with every "." removed (presumably the
// thousands separator; confirm against the statement), or the number 0 when
// the box is missing or empty.
const normalizeAmount = (box) =>
  box?.value ? String(box.value).replaceAll(".", "") : 0;

// Builds the transaction rows of one page. Each date box yields one row; the
// time and amounts are located by their vertical offset from the date, and
// the description is paired by position (n-th date with n-th description).
const buildPageTransactions = (pageBoxes, pageIndex, firstTransNo) => {
  const boxesOfType = (type) => pageBoxes.filter((box) => box.type === type);
  const dateBoxes = boxesOfType("date");
  const timeBoxes = boxesOfType("time");
  const debitBoxes = boxesOfType("debit");
  const creditBoxes = boxesOfType("credit");
  const details = groupDetailLines(boxesOfType("detail"));

  return dateBoxes.map(({ y, value }, index) => {
    const amountRowY = f2(y + AMOUNT_ROW_OFFSET_Y);
    const timeRowY = f2(y + TIME_ROW_OFFSET_Y);
    const isOnAmountRow = (box) => compare_rounded(f2(box.y), amountRowY);

    const timeBox = timeBoxes.find((box) => f2(box.y) === timeRowY);
    if (timeBox === undefined) {
      // Keep going with an empty time instead of aborting the whole export.
      console.warn(
        `page ${pageIndex}: no time found for date ${value} (y=${y})`
      );
    }

    // Key order matters: it becomes the CSV column order.
    return {
      date_time: `${value}_${timeBox?.value ?? ""}`,
      trans_no: firstTransNo + index,
      credit: normalizeAmount(creditBoxes.find(isOnAmountRow)),
      debit: normalizeAmount(debitBoxes.find(isOnAmountRow)),
      detail: details[index]?.value ?? "",
    };
  });
};

/**
 * Reads ./F1040EZ.json, extracts one row per transaction from every page and
 * writes the rows to chuyen_khoan.json and chuyen_khoan.csv.
 *
 * @throws {Error} when the input file cannot be read or parsed, or when it
 *   has no "Pages" array.
 */
const main = () => {
  const statement = JSON.parse(fs.readFileSync("./F1040EZ.json", "utf8"));
  const pages = statement.Pages;
  if (!Array.isArray(pages)) {
    throw new Error('"./F1040EZ.json" does not contain a "Pages" array');
  }

  // Index 0 is skipped, as before. NOTE(review): confirm it has no transactions.
  const firstPageIndex = 1;
  // Never run past the pages that are actually present in the file.
  const endPageIndex = Math.min(STATEMENT_PAGE_COUNT, pages.length);

  const array_all = [];
  for (let i = firstPageIndex; i < endPageIndex; i++) {
    console.log(`parse page ${i}`);
    const pageBoxes = readPageTextBoxes(pages[i]?.Texts ?? []);
    // trans_no keeps counting across pages, starting at 1.
    array_all.push(
      ...buildPageTransactions(pageBoxes, i, array_all.length + 1)
    );
  }

  console.log("total trans count", array_all.length);
  fs.writeFileSync("chuyen_khoan.json", JSON.stringify(array_all));

  const csv = generateCsv(csvConfig)(array_all);
  const filename = `chuyen_khoan.csv`;
  const csvBuffer = Buffer.from(asString(csv));
  // Write the csv file to disk
  writeFile(filename, csvBuffer, (err) => {
    if (err) throw err;
    console.log("file saved: ", filename);
  });
};

main();
| // console.log(Object.keys(pages)); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment