11import axios from 'axios' ;
22import { PDFExtract } from 'pdf.js-extract' ;
3+ import * as XLSX from 'xlsx' ;
4+ import pkg from 'papaparse' ;
5+ const { parse } = pkg ;
36
/**
 * Downloads an attachment and returns its content as plain text.
 *
 * The file type is taken from the extension of `attachment.name`
 * (case-insensitive). Supported extensions: pdf, txt, csv, xls, xlsx.
 * The type is validated before anything is downloaded, so unsupported
 * attachments never trigger a network request.
 *
 * @param {{ name: string, url: string }} attachment - Attachment descriptor;
 *   `name` selects the extractor and `url` is fetched with axios.
 * @returns {Promise<string>} Text extracted from the attachment.
 * @throws {Error} 'Unsupported file type' when the name has no extension or
 *   an extension that is not handled; download and extractor errors propagate.
 */
async function extractFileContent(attachment) {
  const name = typeof attachment.name === 'string' ? attachment.name : '';

  // Only text after the last dot counts as an extension, so a bare file
  // name such as "pdf" is not mistaken for a PDF document.
  const dotIndex = name.lastIndexOf('.');
  const extension = dotIndex === -1 ? '' : name.slice(dotIndex + 1).toLowerCase();

  let extract;
  switch (extension) {
    case 'pdf':
      extract = extractPdfContent;
      break;
    case 'txt':
      extract = (buffer) => buffer.toString('utf-8');
      break;
    case 'csv':
      extract = extractCsvContent;
      break;
    case 'xls':
    case 'xlsx':
      extract = extractExcelContent;
      break;
    default:
      throw new Error('Unsupported file type');
  }

  const response = await axios.get(attachment.url, { responseType: 'arraybuffer' });
  return extract(Buffer.from(response.data));
}
27+
/**
 * Extracts the text of a PDF held in memory.
 *
 * The text items of each page are joined with single spaces, and pages are
 * separated by a blank line.
 *
 * @param {Buffer} buffer - Raw bytes of the PDF document.
 * @returns {Promise<string>} Text of all pages.
 * @throws {Error} 'Failed to parse PDF file' when extraction fails; the
 *   underlying error is logged and attached as `cause`.
 */
async function extractPdfContent(buffer) {
  try {
    const pdfExtract = new PDFExtract();
    const data = await pdfExtract.extractBuffer(buffer);
    return data.pages
      .map((page) => page.content.map((item) => item.str).join(' '))
      .join('\n\n');
  } catch (error) {
    console.error('Error parsing PDF:', error);
    // Keep the original failure reachable for callers that need to debug.
    throw new Error('Failed to parse PDF file', { cause: error });
  }
}
40+
/**
 * Renders a CSV file as a short, readable text summary.
 *
 * The output is a header line with the row count and column names, followed
 * by up to the first 10 rows written as "column: value" pairs. A truncation
 * note is appended when the file has more than 10 rows.
 *
 * @param {Buffer} buffer - Raw bytes of the CSV file, decoded as UTF-8.
 * @returns {Promise<string>} The summary, or 'Empty CSV file' when the parser
 *   yields no data rows.
 * @throws {Error} 'Failed to parse CSV file' when parsing or formatting
 *   throws; the underlying error is logged and attached as `cause`.
 */
async function extractCsvContent(buffer) {
  try {
    const csvText = buffer.toString('utf-8');
    const result = parse(csvText, {
      header: true,
      skipEmptyLines: true,
      dynamicTyping: true,
    });

    const rows = result.data;
    if (rows.length === 0) return 'Empty CSV file';

    // Prefer the parser's header list so a short first row cannot hide
    // columns; fall back to the first row's keys when it is unavailable.
    const headers = result.meta && Array.isArray(result.meta.fields)
      ? result.meta.fields
      : Object.keys(rows[0]);
    const summary = `Table with ${rows.length} rows and the following columns: ${headers.join(', ')}\n\n`;

    // Blank cells are rendered as empty text instead of "null"/"undefined".
    const formattedData = rows
      .slice(0, 10)
      .map((row) => headers
        .map((header) => `${header}: ${row[header] == null ? '' : row[header]}`)
        .join(', '))
      .join('\n');

    const truncationNote = rows.length > 10
      ? '\n\n[Table truncated, showing first 10 rows]'
      : '';
    return summary + formattedData + truncationNote;
  } catch (error) {
    console.error('Error parsing CSV:', error);
    throw new Error('Failed to parse CSV file', { cause: error });
  }
}
67+
/**
 * Renders an Excel workbook as a short, readable text summary.
 *
 * Each non-empty sheet becomes a section: a header line with the sheet name,
 * row count and column names, followed by up to the first 10 rows written as
 * "column: value" pairs, plus a truncation note when the sheet has more than
 * 10 rows. Sections are separated by an "=== Next Sheet ===" marker.
 *
 * @param {Buffer} buffer - Raw bytes of the .xls/.xlsx file.
 * @returns {string} The summary, or 'Empty Excel file' when no sheet has rows.
 * @throws {Error} 'Failed to parse Excel file' when reading or formatting
 *   throws; the underlying error is logged and attached as `cause`.
 */
function extractExcelContent(buffer) {
  try {
    const workbook = XLSX.read(buffer);
    const sections = [];

    for (const sheetName of workbook.SheetNames) {
      // `defval` makes every row carry every column, so columns that are
      // blank in the first row are still listed and empty cells do not
      // print as "undefined".
      const rows = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], { defval: '' });
      if (rows.length === 0) continue;

      const headers = Object.keys(rows[0]);
      const summary = `Sheet "${sheetName}" with ${rows.length} rows and the following columns: ${headers.join(', ')}\n\n`;

      const formattedData = rows
        .slice(0, 10)
        .map((row) => headers.map((header) => `${header}: ${row[header]}`).join(', '))
        .join('\n');

      const truncationNote = rows.length > 10
        ? '\n\n[Table truncated, showing first 10 rows]'
        : '';
      sections.push(summary + formattedData + truncationNote);
    }

    // An empty join result means no sheet contained any data rows.
    return sections.join('\n\n=== Next Sheet ===\n\n') || 'Empty Excel file';
  } catch (error) {
    console.error('Error parsing Excel file:', error);
    throw new Error('Failed to parse Excel file', { cause: error });
  }
}
2696
// Default export: the attachment-to-text entry point defined in this module.
export default extractFileContent;