Skip to content

Instantly share code, notes, and snippets.

@thanhdatpd
Last active September 13, 2024 16:58
Show Gist options
  • Select an option

  • Save thanhdatpd/3b6fc1abead3774ecad24a93966bb9dc to your computer and use it in GitHub Desktop.

Select an option

Save thanhdatpd/3b6fc1abead3774ecad24a93966bb9dc to your computer and use it in GitHub Desktop.
import fs from "fs";
import _ from "lodash";
import { mkConfig, generateCsv, asString } from "export-to-csv";
import { writeFile } from "node:fs";
import { Buffer } from "node:buffer";
// CSV export settings. mkConfig merges the options given here with the
// library defaults and returns WithDefaults<ConfigOptions>; the only option
// overridden is useKeysAsHeaders.
const csvConfig = mkConfig({
  useKeysAsHeaders: true,
});
// import PDFParser from "pdf2json";
// const pdfParser = new PDFParser(this, 1);
// pdfParser.on("pdfParser_dataError", (errData) =>
// console.error(errData.parserError)
// );
// pdfParser.on("pdfParser_dataReady", (pdfData) => {
//fs.writeFile(
// "./F1040EZ.json",
// JSON.stringify(pdfData),
// (data) => console.log(data)
// );
// });
// pdfParser.loadPDF("./pdf.pdf");
console.clear();
// The ten date strings the parser recognises: "01/09/2024" through
// "10/09/2024". Only the first component varies (01..10), so the list is
// generated instead of being spelled out.
const arrDate = Array.from(
  { length: 10 },
  (_unused, position) => `${String(position + 1).padStart(2, "0")}/09/2024`
);
// x coordinates for credit
// 123 [ 14.525, 14.9, 15.15, 15.4, 15.774999999999999, 16.025, 16.275 ] 422
// all x coordinates
// 1.55,
// 14.525,
// 14.9,
// 15.15,
// 15.4,
// 15.774999999999999,
// 16.025,
// 16.275,
// 20.125,
// 24.125,
// 9.95
/**
 * Tell whether a text is exactly one of the date strings listed in `arrDate`.
 *
 * @param {string} value - Candidate text.
 * @returns {boolean} true when `value` is an element of `arrDate`.
 */
const isValidDate = (value) => {
  const isKnownDate = arrDate.includes(value);
  return isKnownDate;
};
/**
 * Format a value as a decimal string with exactly two fraction digits.
 *
 * @param {number|string} value - Anything `Number()` can convert.
 * @returns {string} The fixed-point text, e.g. "3.14" for 3.14159.
 */
const f2 = (value) => {
  const numeric = Number(value);
  return numeric.toFixed(2);
};
/**
 * Round a value to two fraction digits and return it as a number.
 *
 * @param {number|string} value - Anything `Number()` can convert.
 * @returns {number} The value after a round trip through `toFixed(2)`.
 */
const nf2 = (value) => {
  const twoDigitText = Number(value).toFixed(2);
  return Number(twoDigitText);
};
/**
 * Tell whether two numeric values are equal within a tolerance of 0.011.
 *
 * @param {number|string} value - First value, converted with `Number()`.
 * @param {number|string} value2 - Second value, converted with `Number()`.
 * @returns {boolean} true when the absolute difference is at most 0.011;
 *   false when either value converts to NaN.
 */
const compare_rounded = (value, value2) => {
  const tolerance = 0.011;
  const distance = Math.abs(Number(value) - Number(value2));
  return distance <= tolerance;
};
// Two independent, initially empty sets for collecting x and y coordinates.
// In the visible source they are only referenced from commented-out debug code.
const [setX, setY] = [new Set(), new Set()];
/**
 * Decide which statement column a text box belongs to from its x position.
 *
 * @param {number} x - Horizontal position of the text box in the parsed PDF JSON.
 * @param {string} value - Trimmed text of the box; only consulted for the
 *   x === 1.55 column, which holds both dates and times.
 * @returns {"balance"|"detail"|"date"|"time"|"debit"|"credit"|null} The column
 *   type, or null when x matches none of the known columns.
 */
const classifyTextBox = (x, value) => {
  if (x === 20.125) return "balance";
  if (x === 24.125) return "detail";
  // Same column for both: anything there that is not a known date is a time.
  if (x === 1.55) return isValidDate(value) ? "date" : "time";
  if (x === 9.95) return "debit";
  // Credit text starts at varying x positions inside this range.
  if (x >= 14.525 && x <= 16.275) return "credit";
  return null;
};

/**
 * Merge consecutive "detail" text boxes into one entry per transaction.
 *
 * A box continues the current entry when its y (rounded to two digits) is at
 * most `maxLineGap` away from the previous box; otherwise a new entry starts.
 *
 * @param {Array<{y: number, value: string}>} detailBoxes - Detail boxes in page order.
 * @param {number} [maxLineGap=0.77] - Largest y distance between two lines of one entry.
 * @returns {Array<{value: string, first_y: number}>} Joined text (lines
 *   separated by a space) and the y of the first line of each entry.
 */
const mergeDetailLines = (detailBoxes, maxLineGap = 0.77) => {
  const merged = [];
  let currentLines = [];

  const flush = () => {
    if (currentLines.length === 0) return;
    merged.push({
      value: currentLines.map(({ value }) => value).join(" "),
      first_y: currentLines[0].y,
    });
    currentLines = [];
  };

  for (const box of detailBoxes) {
    const previous = currentLines[currentLines.length - 1];
    const continuesPrevious =
      previous !== undefined &&
      Math.abs(nf2(previous.y) - nf2(box.y)) <= maxLineGap;
    if (!continuesPrevious) flush();
    currentLines.push(box);
  }
  flush();

  return merged;
};

/**
 * Convert the parsed PDF JSON in ./F1040EZ.json into a flat list of
 * transactions and write it to chuyen_khoan.json and chuyen_khoan.csv.
 *
 * Each row has the keys date_time, trans_no, credit, debit and detail.
 * Details are paired with dates by their order on the page.
 *
 * @throws {Error} When the JSON has no `Pages` array. Errors from reading,
 *   parsing or synchronous writing propagate unchanged.
 */
const main = () => {
  // Entry 0 of `Pages` is never parsed (unchanged from the original loop).
  // NOTE(review): confirm that skipping the first entry is intentional.
  const firstPageIndex = 1;
  const expectedPageCount = 12028;
  // Text containing this marker ends the body of a page.
  const footerMarker = " of 12028";
  // Text boxes above this y are skipped as page header.
  const bodyStartY = 3.891;
  // Offsets from a date box's y to the rows where amounts and the time are looked up.
  const amountOffsetY = 0.356;
  const timeOffsetY = 0.675;

  const statement = JSON.parse(fs.readFileSync("./F1040EZ.json", "utf8"));
  const pages = statement.Pages;
  if (!Array.isArray(pages)) {
    throw new Error("./F1040EZ.json does not contain a Pages array");
  }
  // Never read past the pages that exist, even if the file holds fewer than
  // the expected count.
  const lastPageIndex = Math.min(expectedPageCount, pages.length) - 1;

  // Every "." is stripped from an amount (presumably thousands separators);
  // a missing or empty amount becomes the number 0.
  const toAmount = (box) =>
    box?.value ? String(box.value).replaceAll(".", "") : 0;

  const transactions = [];

  for (let pageIndex = firstPageIndex; pageIndex <= lastPageIndex; pageIndex++) {
    console.log(`parse page ${pageIndex}`);

    // Collect and classify the text boxes of the page body.
    const textBoxes = [];
    for (const { x, y, R } of pages[pageIndex]?.Texts ?? []) {
      if (y < bodyStartY) continue;
      const text = _.map(R, ({ T }) => decodeURIComponent(T)).join(" ");
      if (text.includes(footerMarker)) break;
      const value = _.trim(text);
      // Classify on the trimmed text so stray whitespace cannot turn a date
      // into a "time" box.
      textBoxes.push({ x, y, value, type: classifyTextBox(x, value) });
    }

    const boxesOfType = (type) => textBoxes.filter((box) => box.type === type);
    const dateBoxes = boxesOfType("date");
    const timeBoxes = boxesOfType("time");
    const debitBoxes = boxesOfType("debit");
    const creditBoxes = boxesOfType("credit");
    const details = mergeDetailLines(boxesOfType("detail"));

    // One transaction per date box.
    dateBoxes.forEach(({ y, value: date }, index) => {
      const amountY = f2(y + amountOffsetY);
      const timeY = f2(y + timeOffsetY);
      const isOnAmountRow = (box) => compare_rounded(f2(box.y), amountY);

      const timeBox = timeBoxes.find((box) => f2(box.y) === timeY);
      if (timeBox === undefined) {
        // Keep the row instead of aborting the whole run on one layout quirk.
        console.warn(
          `page ${pageIndex}: no time found for date ${date} (y=${y})`
        );
      }
      const time = timeBox?.value ?? "";

      transactions.push({
        date_time: `${date}_${time}`,
        trans_no: transactions.length + 1,
        credit: toAmount(creditBoxes.find(isOnAmountRow)),
        debit: toAmount(debitBoxes.find(isOnAmountRow)),
        detail: details[index]?.value ?? "",
      });
    });
  }

  console.log("total trans count", transactions.length);
  fs.writeFileSync("chuyen_khoan.json", JSON.stringify(transactions));

  const csv = generateCsv(csvConfig)(transactions);
  const filename = "chuyen_khoan.csv";
  const csvBuffer = new Uint8Array(Buffer.from(asString(csv)));
  // Write the csv file to disk
  writeFile(filename, csvBuffer, (err) => {
    if (err) throw err;
    console.log("file saved: ", filename);
  });
};
main();
// console.log(Object.keys(pages));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment