joao-vectara committed on
Commit
a7621d6
·
verified ·
1 Parent(s): 5fc1d81

Update upload.py

Browse files

adding PDF support

Files changed (1) hide show
  1. upload.py +43 -1
upload.py CHANGED
@@ -1,14 +1,56 @@
1
  import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
def upload_file_to_vectara(file, customer_id, api_key, corpus_key, timeout=(10, 300)):
    """Upload a file to a Vectara corpus through the v2 ``upload_file`` endpoint.

    Args:
        file: Object exposing ``name`` (used as the multipart filename) and
            ``getvalue()`` (returning the file content).
        customer_id: Accepted so existing callers keep working; this version
            of the function does not send it.
        api_key: Value sent in the ``x-api-key`` header.
        corpus_key: Corpus identifier interpolated into the request URL.
        timeout: Passed to ``requests.post``. Either a single number of
            seconds or a ``(connect, read)`` tuple. ``None`` waits forever.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
        ValueError: If the response body is not valid JSON (raised by
            ``response.json()``).
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/upload_file"
    headers = {
        "x-api-key": api_key,
        "Accept": "application/json",
    }

    files = {"file": (file.name, file.getvalue())}

    # Without an explicit timeout requests blocks indefinitely on a stalled
    # connection, which would hang the calling application.
    response = requests.post(url, headers=headers, files=files, timeout=timeout)

    return response.json()
 
1
  import requests
2
+ import pandas as pd
3
+ import io
4
+ from fpdf import FPDF
5
+ import json
6
+
7
def _latin1_safe(text):
    """Return *text* with characters outside Latin-1 replaced by ``?``."""
    # The built-in PDF fonts (such as "Arial") only cover Latin-1; passing
    # anything else makes the FPDF library raise while rendering.
    return text.encode("latin-1", "replace").decode("latin-1")


def convert_xlsx_to_pdf(file, max_rows=10):
    """Render an XLSX workbook as a plain-text PDF held in memory.

    Every sheet contributes a centred ``Sheet: <name>`` heading followed by
    its first ``max_rows`` data rows, each written as the cell values joined
    with ``" | "``. Column headers are not written, and rows beyond the limit
    are dropped.

    Args:
        file: Workbook source accepted by ``pandas.ExcelFile`` that also has a
            ``name`` attribute (used to derive the PDF filename).
        max_rows: Maximum number of rows rendered per sheet. ``None`` renders
            every row. Defaults to 10, the previous hard-coded limit.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 containing the PDF bytes,
        with a ``name`` attribute ending in ``.pdf``.
    """
    excel_data = pd.ExcelFile(file)
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for sheet_name in excel_data.sheet_names:
        heading = _latin1_safe(f"Sheet: {sheet_name}")
        pdf.cell(200, 10, txt=heading, ln=True, align='C')
        pdf.ln(10)
        df = excel_data.parse(sheet_name)

        row_count = len(df) if max_rows is None else min(max_rows, len(df))
        # Positional row access is kept on purpose so each value is
        # stringified exactly as before.
        for i in range(row_count):
            row_data = " | ".join(str(x) for x in df.iloc[i])
            pdf.multi_cell(0, 10, _latin1_safe(row_data))

        pdf.ln(5)

    rendered = pdf.output(dest='S')
    # Legacy PyFPDF returns a str whose code points are the byte values,
    # while fpdf2 returns a bytearray; normalise both to bytes.
    if isinstance(rendered, str):
        rendered = rendered.encode('latin1')
    pdf_output = io.BytesIO(bytes(rendered))

    # BytesIO has no filename of its own; attach one so the result can stand
    # in for an uploaded file. Only the trailing extension is swapped
    # (case-insensitively), never an ".xlsx" found in the middle of the name.
    source_name = file.name
    if source_name.lower().endswith(".xlsx"):
        pdf_output.name = source_name[:-len(".xlsx")] + ".pdf"
    else:
        pdf_output.name = source_name + ".pdf"

    return pdf_output
34
 
35
def upload_file_to_vectara(file, customer_id, api_key, corpus_key, timeout=(10, 300)):
    """Upload a file to a Vectara corpus through the v2 ``upload_file`` endpoint.

    Files whose name ends in ``.xlsx`` (any letter case) are first converted
    to PDF with ``convert_xlsx_to_pdf`` and tagged with the metadata
    ``{"type_file": "excel"}``. All other files are sent unchanged with empty
    metadata.

    Args:
        file: Object exposing ``name`` and ``getvalue()``.
        customer_id: Value sent in the ``customer-id`` header.
        api_key: Value sent in the ``x-api-key`` header.
        corpus_key: Corpus identifier interpolated into the request URL.
        timeout: Passed to ``requests.post``. Either a single number of
            seconds or a ``(connect, read)`` tuple. ``None`` waits forever.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
        ValueError: If the response body is not valid JSON (raised by
            ``response.json()``).
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/upload_file"
    headers = {
        "customer-id": customer_id,
        "x-api-key": api_key,
        "Accept": "application/json",
    }

    upload_name = file.name
    # Decide once, ignoring case, so "REPORT.XLSX" is treated like "report.xlsx".
    is_excel = upload_name.lower().endswith('.xlsx')
    metadata = {"type_file": "excel"} if is_excel else {}

    if is_excel:
        file = convert_xlsx_to_pdf(file)  # Convert XLSX to PDF
        # Name the part after the content actually sent: swap only the
        # trailing extension for ".pdf".
        upload_name = upload_name[:-len('.xlsx')] + '.pdf'

    files = {
        'metadata': (None, json.dumps(metadata), 'application/json'),
        "file": (upload_name, file.getvalue()),
    }

    # Without an explicit timeout requests blocks indefinitely on a stalled
    # connection, which would hang the calling application.
    response = requests.post(url, headers=headers, files=files, timeout=timeout)

    return response.json()