File size: 888 Bytes
c731612
 
 
a803cdd
c731612
 
dc50cde
 
 
 
 
c68949e
 
c731612
 
 
 
 
 
 
 
 
 
ca1c8f9
c731612
4fc3024
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import download_and_extract_zip

def gen_splits():
    """Download the PDF archive, load every PDF in it, and return text chunks.

    The archive location is read from the ``URL`` environment variable and
    handed, together with the current working directory, to
    ``download_and_extract_zip.download_and_extract_zip``. After that call a
    ``Model_TS_Full`` directory is expected to exist under the working
    directory (NOTE(review): this relies on the layout of the archive;
    confirm against the helper and the archive contents).

    Every file in ``Model_TS_Full`` whose name ends in ``.pdf``
    (case-insensitive) is loaded with ``PyPDFLoader`` and the resulting
    documents are split with ``RecursiveCharacterTextSplitter`` using
    ``chunk_size=7500`` and ``chunk_overlap=500``.

    Returns:
        The list produced by ``text_splitter.split_documents`` for all
        loaded documents, with files processed in sorted name order.

    Raises:
        ValueError: If the ``URL`` environment variable is unset or empty.
    """
    url = os.getenv('URL')
    if not url:
        # Fail fast with a clear message instead of handing None to the
        # downloader and failing somewhere deep inside it.
        raise ValueError(
            "Environment variable 'URL' must be set to the archive location"
        )

    destination_folder = os.getcwd()
    download_and_extract_zip.download_and_extract_zip(url, destination_folder)

    # Use one base directory for both listing and path building so the two
    # can never disagree.
    pdf_dir = os.path.join(destination_folder, 'Model_TS_Full')

    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of documents (and therefore of chunks) reproducible between runs.
    pdf_paths = [
        os.path.join(pdf_dir, name)
        for name in sorted(os.listdir(pdf_dir))
        if name.lower().endswith(".pdf")
    ]

    docs = []
    for pdf_path in pdf_paths:
        docs.extend(PyPDFLoader(pdf_path).load())

    # Splitting Documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=500)
    return text_splitter.split_documents(docs)