Skip to content

Commit 3f30116

Browse files
committed
table support
1 parent 2f14442 commit 3f30116

4 files changed

Lines changed: 229 additions & 29 deletions

File tree

package-lock.json

Lines changed: 112 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@
3434
"dotenv": "^16.4.5",
3535
"form-data": "^4.0.1",
3636
"openai": "^4.73.0",
37-
"pdf.js-extract": "^0.2.1"
37+
"papaparse": "^5.4.1",
38+
"pdf.js-extract": "^0.2.1",
39+
"xlsx": "^0.18.5"
3840
},
3941
"engines": {
4042
"node": ">=18.0.0"

utils/buildConversationLog.js

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import {
77
import transcribeVoiceMessage from "./transcribeVoiceMessage.js";
88
import extractFileContent from "./extractFileContent.js";
99

10+
// File-name suffixes (lowercase, with leading dot) of attachments whose
// content is passed to extractFileContent and added to the conversation as text.
const SUPPORTED_FILE_TYPES = ['.pdf', '.txt', '.csv', '.xls', '.xlsx'];
11+
1012
async function buildConversationLog(message, client) {
1113
const conversationLog = [
1214
{
@@ -50,40 +52,55 @@ async function buildConversationLog(message, client) {
5052
const attachmentPromises = Array.from(message.attachments.values()).map(async (attachment) => {
5153
if (attachment.name.endsWith('.ogg')) return null;
5254

53-
// Handle PDFs and text files
54-
if (attachment.name.endsWith('.pdf') || attachment.name.endsWith('.txt')) {
55+
// Check if this is a supported file type
56+
const isExtractableFile = SUPPORTED_FILE_TYPES.some(ext => attachment.name.toLowerCase().endsWith(ext));
57+
58+
if (isExtractableFile) {
5559
try {
60+
console.log(`Processing file: ${attachment.name}`);
5661
const extractedText = await extractFileContent(attachment);
5762
return {
5863
type: "text",
59-
text: `Content from ${attachment.name}:\n${extractedText}`,
64+
text: `Content from ${attachment.name}:\n${extractedText}`
6065
};
6166
} catch (error) {
6267
console.error(`Error extracting content from ${attachment.name}:`, error);
63-
return null;
68+
return {
69+
type: "text",
70+
text: `Failed to process ${attachment.name}: ${error.message}`
71+
};
6472
}
6573
}
6674

67-
// Handle images
75+
// If not a supported file type, treat as image
76+
if (attachment.contentType?.startsWith('image/')) {
77+
return {
78+
type: "image_url",
79+
image_url: { url: attachment.url }
80+
};
81+
}
82+
83+
// For unsupported file types, add a note
6884
return {
69-
type: "image_url",
70-
image_url: { url: attachment.url },
85+
type: "text",
86+
text: `Note: File ${attachment.name} is not a supported format.`
7187
};
7288
});
7389

7490
const processedAttachments = (await Promise.all(attachmentPromises)).filter(Boolean);
7591

7692
if (processedAttachments.length > 0) {
77-
conversationLog.push({
93+
const userMessage = {
7894
role: "user",
7995
content: [
8096
{
8197
type: "text",
82-
text: message.content,
98+
text: message.content || "Please analyze this file."
8399
},
84-
...processedAttachments,
85-
],
86-
});
100+
...processedAttachments
101+
]
102+
};
103+
conversationLog.push(userMessage);
87104
}
88105
} else {
89106
conversationLog.push({

utils/extractFileContent.js

Lines changed: 85 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,97 @@
11
import axios from 'axios';
22
import { PDFExtract } from 'pdf.js-extract';
3+
import * as XLSX from 'xlsx';
4+
import pkg from 'papaparse';
5+
const { parse } = pkg;
36

47
/**
 * Downloads an attachment and converts its content to plain text.
 *
 * The extractor is chosen from the case-insensitive extension of
 * `attachment.name`. The extension is validated before the download so that
 * unsupported files are rejected without a network round trip.
 *
 * @param {{ name: string, url: string }} attachment - Attachment descriptor;
 *   only `name` and `url` are read.
 * @returns {Promise<string>} Text extracted from the file.
 * @throws {Error} 'Unsupported file type' when the name has no recognised
 *   extension. Download and parser errors propagate to the caller.
 */
async function extractFileContent(attachment) {
  // A name without a dot has no extension. Splitting on '.' would hand back
  // the whole name, so a file literally called "pdf" would be parsed as a PDF.
  const dotIndex = attachment.name.lastIndexOf('.');
  const extension = dotIndex === -1
    ? ''
    : attachment.name.slice(dotIndex + 1).toLowerCase();

  // A switch (rather than an object lookup) keeps inherited keys such as
  // "constructor" from ever being treated as an extractor.
  let extract;
  switch (extension) {
    case 'pdf':
      extract = extractPdfContent;
      break;
    case 'txt':
      extract = (buffer) => buffer.toString('utf-8');
      break;
    case 'csv':
      extract = extractCsvContent;
      break;
    case 'xls':
    case 'xlsx':
      extract = extractExcelContent;
      break;
    default:
      throw new Error('Unsupported file type');
  }

  const response = await axios.get(attachment.url, { responseType: 'arraybuffer' });
  return extract(Buffer.from(response.data));
}
27+
28+
/**
 * Extracts the text of a PDF held in memory.
 *
 * Text items on a page are joined with single spaces; pages are separated by
 * a blank line.
 *
 * @param {Buffer} buffer - Raw bytes of the PDF file.
 * @returns {Promise<string>} Concatenated text of all pages.
 * @throws {Error} 'Failed to parse PDF file'; the underlying parser error is
 *   attached as `cause` so callers can still inspect it.
 */
async function extractPdfContent(buffer) {
  try {
    const pdfExtract = new PDFExtract();
    const { pages } = await pdfExtract.extractBuffer(buffer);
    return pages
      .map((page) => page.content.map((item) => item.str).join(' '))
      .join('\n\n');
  } catch (error) {
    console.error('Error parsing PDF:', error);
    throw new Error('Failed to parse PDF file', { cause: error });
  }
}
40+
41+
/**
 * Summarises a CSV file as text: a header line with the row count and column
 * names, followed by a preview of the first rows as `column: value` pairs.
 *
 * Column names come from the parser's header metadata (`meta.fields`) rather
 * than from the first data row, so a short first row does not hide columns.
 * Cells that are missing or empty are rendered as an empty value instead of
 * the literal text "undefined" / "null".
 *
 * @param {Buffer} buffer - Raw bytes of the CSV file, decoded as UTF-8.
 * @returns {Promise<string>} The summary text, or 'Empty CSV file' when the
 *   file has no data rows.
 * @throws {Error} 'Failed to parse CSV file'; the underlying error is
 *   attached as `cause`.
 */
async function extractCsvContent(buffer) {
  const MAX_PREVIEW_ROWS = 10;

  try {
    const csvText = buffer.toString('utf-8');
    const { data: rows, meta } = parse(csvText, {
      header: true,
      skipEmptyLines: true,
      dynamicTyping: true,
    });

    if (rows.length === 0) return 'Empty CSV file';

    // Fall back to the first row's keys only if the parser reports no fields.
    const headers = meta?.fields ?? Object.keys(rows[0]);
    const summary = `Table with ${rows.length} rows and the following columns: ${headers.join(', ')}\n\n`;

    // `??` (not `||`) so that legitimate 0 / false values are still printed.
    const preview = rows
      .slice(0, MAX_PREVIEW_ROWS)
      .map((row) => headers.map((header) => `${header}: ${row[header] ?? ''}`).join(', '))
      .join('\n');

    const truncationNote = rows.length > MAX_PREVIEW_ROWS
      ? `\n\n[Table truncated, showing first ${MAX_PREVIEW_ROWS} rows]`
      : '';

    return summary + preview + truncationNote;
  } catch (error) {
    console.error('Error parsing CSV:', error);
    throw new Error('Failed to parse CSV file', { cause: error });
  }
}
67+
68+
/**
 * Summarises every non-empty sheet of an Excel workbook as text: for each
 * sheet, a header line with the row count and column names, followed by a
 * preview of the first rows as `column: value` pairs.
 *
 * Column names are the union of the keys of all rows, in first-seen order.
 * Row objects omit blank cells, so looking only at the first row would drop
 * any column that happens to be empty there. Missing cells are rendered as an
 * empty value instead of the literal text "undefined".
 *
 * @param {Buffer} buffer - Raw bytes of the .xls/.xlsx file.
 * @returns {string} Sheet summaries joined by a "=== Next Sheet ===" divider,
 *   or 'Empty Excel file' when no sheet contains data rows.
 * @throws {Error} 'Failed to parse Excel file'; the underlying error is
 *   attached as `cause`.
 */
function extractExcelContent(buffer) {
  const MAX_PREVIEW_ROWS = 10;

  try {
    const workbook = XLSX.read(buffer);
    const sections = [];

    for (const sheetName of workbook.SheetNames) {
      const rows = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName]);
      if (rows.length === 0) continue;

      const headerSet = new Set();
      for (const row of rows) {
        for (const key of Object.keys(row)) headerSet.add(key);
      }
      const headers = [...headerSet];

      const summary = `Sheet "${sheetName}" with ${rows.length} rows and the following columns: ${headers.join(', ')}\n\n`;

      // `??` (not `||`) so that legitimate 0 / false values are still printed.
      const preview = rows
        .slice(0, MAX_PREVIEW_ROWS)
        .map((row) => headers.map((header) => `${header}: ${row[header] ?? ''}`).join(', '))
        .join('\n');

      const truncationNote = rows.length > MAX_PREVIEW_ROWS
        ? `\n\n[Table truncated, showing first ${MAX_PREVIEW_ROWS} rows]`
        : '';

      sections.push(summary + preview + truncationNote);
    }

    return sections.join('\n\n=== Next Sheet ===\n\n') || 'Empty Excel file';
  } catch (error) {
    console.error('Error parsing Excel file:', error);
    throw new Error('Failed to parse Excel file', { cause: error });
  }
}
2696

2797
export default extractFileContent;

0 commit comments

Comments
 (0)