NightPrince committed
Commit fec11df · 0 Parent(s)

Clean repo without binary files
app.py ADDED
@@ -0,0 +1,39 @@
+ import streamlit as st
+ from pipeline.pipeline import CodeGenPipeline
+
+ st.set_page_config(page_title="CodeGenBot", page_icon="🤖")
+ st.title("💻 CodeGenBot")
+
+ # Initialize pipeline only once (cache in session state)
+ if "pipeline" not in st.session_state:
+     st.session_state.pipeline = CodeGenPipeline("hf://datasets/openai/openai_humaneval/openai_humaneval/test-00000-of-00001.parquet")
+
+ # Memory for chat messages
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ # Display previous messages
+ for msg in st.session_state.messages:
+     if msg["role"] == "assistant":
+         st.chat_message("assistant").code(msg["content"], language="python")
+     else:
+         st.chat_message(msg["role"]).write(msg["content"])
+
+ # User input
+ user_input = st.chat_input("Ask CodeGenBot to generate Python code...")
+
+ if user_input:
+     # Save user message
+     st.session_state.messages.append({"role": "user", "content": user_input})
+     st.chat_message("user").write(user_input)
+
+     # Generate code using the retrieval-augmented pipeline
+     with st.spinner("Generating code..."):
+         try:
+             code_output = st.session_state.pipeline.generate_code_from_prompt(user_input)
+         except Exception as e:
+             code_output = f"Error: {e}"
+
+     # Save assistant's reply and rerun to display it immediately
+     st.session_state.messages.append({"role": "assistant", "content": code_output})
+     st.rerun()
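Side note: `st.session_state` is per browser session, so every new visitor re-embeds the whole HumanEval split. A minimal sketch of an alternative, assuming the same `CodeGenPipeline` constructor, that caches one pipeline per process with Streamlit's `st.cache_resource` (the `load_pipeline` helper is hypothetical):

import streamlit as st
from pipeline.pipeline import CodeGenPipeline

@st.cache_resource  # one shared instance per process instead of one per session
def load_pipeline(parquet_path: str) -> CodeGenPipeline:
    # hypothetical helper; same constructor call app.py makes today
    return CodeGenPipeline(parquet_path)

pipeline = load_pipeline("hf://datasets/openai/openai_humaneval/openai_humaneval/test-00000-of-00001.parquet")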
pipeline/__init__.py ADDED
File without changes
pipeline/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (176 Bytes)
pipeline/__pycache__/codegen.cpython-310.pyc ADDED
Binary file (823 Bytes)
pipeline/__pycache__/embedding.cpython-310.pyc ADDED
Binary file (777 Bytes)
pipeline/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (1.49 kB)
pipeline/__pycache__/retrieval.cpython-310.pyc ADDED
Binary file (1.07 kB)
 
pipeline/codegen.py ADDED
@@ -0,0 +1,20 @@
+ from dotenv import load_dotenv
+ import os
+ from huggingface_hub import InferenceClient
+
+ load_dotenv()
+ API_KEY = os.getenv("HF_API_KEY")
+ MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+
+
+ def generate_code_with_context(user_prompt, context=None):
+     client = InferenceClient(provider="nscale", api_key=API_KEY)
+     if context:
+         final_prompt = f"{context}\n\n# Your Task:\n{user_prompt}\n"
+     else:
+         final_prompt = user_prompt
+     completion = client.chat.completions.create(
+         model=MODEL_NAME,
+         messages=[{"role": "user", "content": final_prompt}],
+     )
+     return completion.choices[0].message.content
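For reference, a minimal standalone call to `generate_code_with_context`, assuming `HF_API_KEY` is set in `.env`; the context string here is a hypothetical retrieved example in the same `# Task:` format that pipeline.py assembles:

from pipeline.codegen import generate_code_with_context

# hypothetical retrieved example, mimicking the context built by pipeline.py
context = "# Task:\ndef add(a, b):\n    return a + b"
print(generate_code_with_context("Write a function that multiplies two numbers.", context))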
pipeline/embedding.py ADDED
@@ -0,0 +1,8 @@
+ from sentence_transformers import SentenceTransformer
+
+ class Embedder:
+     def __init__(self, model_name='all-MiniLM-L6-v2'):
+         self.model = SentenceTransformer(model_name)
+
+     def encode(self, texts, batch_size=32, show_progress_bar=False):
+         return self.model.encode(texts, batch_size=batch_size, show_progress_bar=show_progress_bar)
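A quick usage sketch: `encode` returns one vector per input text as a NumPy array, 384-dimensional for `all-MiniLM-L6-v2`:

from pipeline.embedding import Embedder

embedder = Embedder()
vectors = embedder.encode(["def add(a, b): return a + b", "def reverse(s): return s[::-1]"])
print(vectors.shape)  # (2, 384) for all-MiniLM-L6-v2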
pipeline/pipeline.py ADDED
@@ -0,0 +1,19 @@
+ import pandas as pd
+ from .embedding import Embedder
+ from .retrieval import RetrievalDB
+ from .codegen import generate_code_with_context
+
+ class CodeGenPipeline:
+     def __init__(self, parquet_path):
+         self.df = pd.read_parquet(parquet_path)
+         self.prompts = self.df["prompt"].tolist()
+         self.solutions = self.df["canonical_solution"].tolist()
+         self.embedder = Embedder()
+         self.embeddings = self.embedder.encode(self.prompts, batch_size=32, show_progress_bar=True)
+         self.retrieval_db = RetrievalDB(self.prompts, self.embeddings, self.solutions)
+
+     def generate_code_from_prompt(self, user_prompt, k=1):
+         query_emb = self.embedder.encode([user_prompt])[0]
+         retrieved = self.retrieval_db.retrieve_similar_context(query_emb, k=k)[0]
+         context = "\n\n".join([f"# Task:\n{r['prompt']}\n{r['solution']}" for r in retrieved])
+         return generate_code_with_context(user_prompt, context)
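End to end, the pipeline embeds the query, retrieves the k most similar HumanEval tasks, and prepends them as context to the prompt. A minimal driver, assuming the same parquet path app.py uses:

from pipeline.pipeline import CodeGenPipeline

pipe = CodeGenPipeline("hf://datasets/openai/openai_humaneval/openai_humaneval/test-00000-of-00001.parquet")
print(pipe.generate_code_from_prompt("Write a function that checks if a string is a palindrome.", k=2))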
pipeline/retrieval.py ADDED
@@ -0,0 +1,17 @@
+ import chromadb
+
+ class RetrievalDB:
+     def __init__(self, prompts, embeddings, solutions, collection_name="humaneval"):
+         self.client = chromadb.Client()
+         # get_or_create avoids a duplicate-collection error when the DB is rebuilt in the same process
+         self.collection = self.client.get_or_create_collection(name=collection_name)
+         for idx, (emb, prompt, solution) in enumerate(zip(embeddings, prompts, solutions)):
+             self.collection.add(
+                 ids=[str(idx)],
+                 embeddings=[emb.tolist()],
+                 metadatas=[{"prompt": prompt, "solution": solution}]
+             )
+
+     def retrieve_similar_context(self, query_emb, k=1):
+         results = self.collection.query(query_embeddings=[query_emb], n_results=k)
+         return results["metadatas"]
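A self-contained smoke test of `RetrievalDB` with toy embeddings (no model download needed); the orthogonal unit vectors make the nearest neighbour unambiguous:

import numpy as np
from pipeline.retrieval import RetrievalDB

prompts = ["add two numbers", "reverse a string"]
solutions = ["return a + b", "return s[::-1]"]
db = RetrievalDB(prompts, np.eye(2), solutions, collection_name="toy")
print(db.retrieve_similar_context([1.0, 0.0], k=1)[0])  # -> [{'prompt': 'add two numbers', ...}]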
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ pandas
+ sentence-transformers
+ chromadb
+ huggingface_hub
+ python-dotenv
+ pyarrow
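To run locally: `pip install -r requirements.txt`, put `HF_API_KEY=...` in a `.env` file (read by `load_dotenv()` in pipeline/codegen.py), then `streamlit run app.py`.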