Add table support (CSV and Excel attachments)

Decryptu · Decryptu · commit 3f301166b8d0 · 2024-12-07T12:51:58.000+01:00
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -34,7 +34,9 @@
         "dotenv": "^16.4.5",
         "form-data": "^4.0.1",
         "openai": "^4.73.0",
-        "pdf.js-extract": "^0.2.1"
+        "papaparse": "^5.4.1",
+        "pdf.js-extract": "^0.2.1",
+        "xlsx": "^0.18.5"
     },
     "engines": {
         "node": ">=18.0.0"
diff --git a/utils/buildConversationLog.js b/utils/buildConversationLog.js
@@ -7,6 +7,8 @@ import {
 import transcribeVoiceMessage from "./transcribeVoiceMessage.js";
 import extractFileContent from "./extractFileContent.js";
 
+const SUPPORTED_FILE_TYPES = ['.pdf', '.txt', '.csv', '.xls', '.xlsx'];
+
 async function buildConversationLog(message, client) {
   const conversationLog = [
     {
@@ -50,40 +52,55 @@ async function buildConversationLog(message, client) {
     const attachmentPromises = Array.from(message.attachments.values()).map(async (attachment) => {
       if (attachment.name.endsWith('.ogg')) return null;
       
-      // Handle PDFs and text files
-      if (attachment.name.endsWith('.pdf') || attachment.name.endsWith('.txt')) {
+      // Check if this is a supported file type
+      const isExtractableFile = SUPPORTED_FILE_TYPES.some(ext => attachment.name.toLowerCase().endsWith(ext));
+      
+      if (isExtractableFile) {
         try {
+          console.log(`Processing file: ${attachment.name}`);
           const extractedText = await extractFileContent(attachment);
           return {
             type: "text",
-            text: `Content from ${attachment.name}:\n${extractedText}`,
+            text: `Content from ${attachment.name}:\n${extractedText}`
           };
         } catch (error) {
           console.error(`Error extracting content from ${attachment.name}:`, error);
-          return null;
+          return {
+            type: "text",
+            text: `Failed to process ${attachment.name}: ${error.message}`
+          };
         }
       }
       
-      // Handle images
+      // Not an extractable file: pass it through as an image only when its content type is image/*
+      if (attachment.contentType?.startsWith('image/')) {
+        return {
+          type: "image_url",
+          image_url: { url: attachment.url }
+        };
+      }
+      
+      // For unsupported file types, add a note
       return {
-        type: "image_url",
-        image_url: { url: attachment.url },
+        type: "text",
+        text: `Note: File ${attachment.name} is not a supported format.`
       };
     });
 
     const processedAttachments = (await Promise.all(attachmentPromises)).filter(Boolean);
 
     if (processedAttachments.length > 0) {
-      conversationLog.push({
+      const userMessage = {
         role: "user",
         content: [
           {
             type: "text",
-            text: message.content,
+            text: message.content || "Please analyze this file."
           },
-          ...processedAttachments,
-        ],
-      });
+          ...processedAttachments
+        ]
+      };
+      conversationLog.push(userMessage);
     }
   } else {
     conversationLog.push({
diff --git a/utils/extractFileContent.js b/utils/extractFileContent.js
@@ -1,27 +1,97 @@
 import axios from 'axios';
 import { PDFExtract } from 'pdf.js-extract';
+import * as XLSX from 'xlsx';
+import pkg from 'papaparse';
+const { parse } = pkg;
 
 async function extractFileContent(attachment) {
   const response = await axios.get(attachment.url, { responseType: 'arraybuffer' });
   const buffer = Buffer.from(response.data);
 
-  if (attachment.name.endsWith('.pdf')) {
-    try {
-      const pdfExtract = new PDFExtract();
-      const data = await pdfExtract.extractBuffer(buffer);
-      const text = data.pages
-        .map(page => page.content.map(item => item.str).join(' '))
-        .join('\n\n');
-      return text;
-    } catch (error) {
-      console.error('Error parsing PDF:', error);
-      throw new Error('Failed to parse PDF file');
-    }
-  } else if (attachment.name.endsWith('.txt')) {
-    return buffer.toString('utf-8');
+  const extension = attachment.name.split('.').pop().toLowerCase();
+
+  switch (extension) {
+    case 'pdf':
+      return await extractPdfContent(buffer);
+    case 'txt':
+      return buffer.toString('utf-8');
+    case 'csv':
+      return await extractCsvContent(buffer);
+    case 'xls':
+    case 'xlsx':
+      return extractExcelContent(buffer);
+    default:
+      throw new Error('Unsupported file type');
+  }
+}
+
+async function extractPdfContent(buffer) {
+  try {
+    const pdfExtract = new PDFExtract();
+    const data = await pdfExtract.extractBuffer(buffer);
+    return data.pages
+      .map(page => page.content.map(item => item.str).join(' '))
+      .join('\n\n');
+  } catch (error) {
+    console.error('Error parsing PDF:', error);
+    throw new Error('Failed to parse PDF file');
   }
+}
+
+async function extractCsvContent(buffer) {
+  try {
+    const csvText = buffer.toString('utf-8');
+    const result = parse(csvText, {
+      header: true,
+      skipEmptyLines: true,
+      dynamicTyping: true
+    });
+
+    // Format CSV data into a readable table structure
+    if (result.data.length === 0) return 'Empty CSV file';
 
-  throw new Error('Unsupported file type');
+    const headers = Object.keys(result.data[0]);
+    const summary = `Table with ${result.data.length} rows and the following columns: ${headers.join(', ')}\n\n`;
+    
+    // Convert data to a readable format
+    const formattedData = result.data.slice(0, 10).map(row => {
+      return headers.map(header => `${header}: ${row[header]}`).join(', ');
+    }).join('\n');
+
+    return summary + formattedData + (result.data.length > 10 ? '\n\n[Table truncated, showing first 10 rows]' : '');
+  } catch (error) {
+    console.error('Error parsing CSV:', error);
+    throw new Error('Failed to parse CSV file');
+  }
+}
+
+function extractExcelContent(buffer) {
+  try {
+    const workbook = XLSX.read(buffer);
+    const result = [];
+
+    for (const sheetName of workbook.SheetNames) {
+      const sheet = workbook.Sheets[sheetName];
+      const data = XLSX.utils.sheet_to_json(sheet);
+
+      if (data.length === 0) continue;
+
+      const headers = Object.keys(data[0]);
+      const summary = `Sheet "${sheetName}" with ${data.length} rows and the following columns: ${headers.join(', ')}\n\n`;
+      
+      // Convert data to a readable format
+      const formattedData = data.slice(0, 10).map(row => {
+        return headers.map(header => `${header}: ${row[header]}`).join(', ');
+      }).join('\n');
+
+      result.push(summary + formattedData + (data.length > 10 ? '\n\n[Table truncated, showing first 10 rows]' : ''));
+    }
+
+    return result.join('\n\n=== Next Sheet ===\n\n') || 'Empty Excel file';
+  } catch (error) {
+    console.error('Error parsing Excel file:', error);
+    throw new Error('Failed to parse Excel file');
+  }
 }
 
 export default extractFileContent;