Commit fec11df
Parent(s):
Clean repo without binary files

- app.py +39 -0
- pipeline/__init__.py +0 -0
- pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
- pipeline/__pycache__/codegen.cpython-310.pyc +0 -0
- pipeline/__pycache__/embedding.cpython-310.pyc +0 -0
- pipeline/__pycache__/pipeline.cpython-310.pyc +0 -0
- pipeline/__pycache__/retrieval.cpython-310.pyc +0 -0
- pipeline/codegen.py +20 -0
- pipeline/embedding.py +8 -0
- pipeline/pipeline.py +18 -0
- pipeline/retrieval.py +16 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,39 @@
+import streamlit as st
+from pipeline.pipeline import CodeGenPipeline
+
+st.set_page_config(page_title="CodeGenBot", page_icon="🤖")
+st.title("💻 CodeGenBot")
+
+# Initialize pipeline only once (cache in session state)
+if "pipeline" not in st.session_state:
+    st.session_state.pipeline = CodeGenPipeline("hf://datasets/openai/openai_humaneval/openai_humaneval/test-00000-of-00001.parquet")
+
+# Memory for chat messages
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Display previous messages
+for msg in st.session_state.messages:
+    if msg["role"] == "assistant":
+        st.chat_message("assistant").code(msg["content"], language="python")
+    else:
+        st.chat_message(msg["role"]).write(msg["content"])
+
+# User input
+user_input = st.chat_input("Ask CodeGenBot to generate Python code...")
+
+if user_input:
+    # Save user message
+    st.session_state.messages.append({"role": "user", "content": user_input})
+    st.chat_message("user").write(user_input)
+
+    # Generate code using your pipeline
+    with st.spinner("Generating code..."):
+        try:
+            code_output = st.session_state.pipeline.generate_code_from_prompt(user_input)
+        except Exception as e:
+            code_output = f"Error: {e}"
+
+    # Save assistant's reply and rerun to display it immediately
+    st.session_state.messages.append({"role": "assistant", "content": code_output})
+    st.rerun()
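Note: assuming the requirements below plus Streamlit itself are installed and HF_API_KEY is set (see pipeline/codegen.py), the app is launched with streamlit run app.py. Streamlit re-executes this script top to bottom on every interaction, which is why both the pipeline and the message history are cached in st.session_state rather than rebuilt on each rerun.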
pipeline/__init__.py
ADDED
File without changes
pipeline/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (176 Bytes).
pipeline/__pycache__/codegen.cpython-310.pyc
ADDED
Binary file (823 Bytes).
pipeline/__pycache__/embedding.cpython-310.pyc
ADDED
Binary file (777 Bytes).
pipeline/__pycache__/pipeline.cpython-310.pyc
ADDED
Binary file (1.49 kB).
pipeline/__pycache__/retrieval.cpython-310.pyc
ADDED
Binary file (1.07 kB).
pipeline/codegen.py
ADDED
@@ -0,0 +1,20 @@
+from dotenv import load_dotenv
+import os
+from huggingface_hub import InferenceClient
+
+load_dotenv()
+API_KEY = os.getenv("HF_API_KEY")
+MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+
+
+def generate_code_with_context(user_prompt, context=None):
+    client = InferenceClient(provider="nscale", api_key=API_KEY)
+    if context:
+        final_prompt = f"{context}\n\n# Your Task:\n{user_prompt}\n"
+    else:
+        final_prompt = user_prompt
+    completion = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": final_prompt}],
+    )
+    return completion.choices[0].message.content
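For a quick standalone check of this helper, a minimal sketch, assuming a valid HF_API_KEY in a local .env file; the prompt string is purely illustrative:

import pipeline.codegen as codegen

# Hypothetical smoke test; needs HF_API_KEY and network access
# to the nscale inference provider.
print(codegen.generate_code_with_context("Write a Python function that reverses a string."))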
pipeline/embedding.py
ADDED
@@ -0,0 +1,8 @@
+from sentence_transformers import SentenceTransformer
+
+class Embedder:
+    def __init__(self, model_name='all-MiniLM-L6-v2'):
+        self.model = SentenceTransformer(model_name)
+
+    def encode(self, texts, batch_size=32, show_progress_bar=False):
+        return self.model.encode(texts, batch_size=batch_size, show_progress_bar=show_progress_bar)
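Embedder is a thin wrapper over sentence-transformers; a minimal usage sketch (the input strings are illustrative; all-MiniLM-L6-v2 produces 384-dimensional vectors):

from pipeline.embedding import Embedder

embedder = Embedder()  # downloads/loads all-MiniLM-L6-v2 on first use
vectors = embedder.encode(["reverse a string", "sort a list of integers"])
print(vectors.shape)  # (2, 384): one 384-dim vector per input text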
pipeline/pipeline.py
ADDED
@@ -0,0 +1,18 @@
+import pandas as pd
+from .embedding import Embedder
+from .retrieval import RetrievalDB
+from .codegen import generate_code_with_context
+class CodeGenPipeline:
+    def __init__(self, parquet_path):
+        self.df = pd.read_parquet(parquet_path)
+        self.prompts = self.df["prompt"].tolist()
+        self.solutions = self.df["canonical_solution"].tolist()
+        self.embedder = Embedder()
+        self.embeddings = self.embedder.encode(self.prompts, batch_size=32, show_progress_bar=True)
+        self.retrieval_db = RetrievalDB(self.prompts, self.embeddings, self.solutions)
+
+    def generate_code_from_prompt(self, user_prompt, k=1):
+        query_emb = self.embedder.encode([user_prompt])[0]
+        retrieved = self.retrieval_db.retrieve_similar_context(query_emb, k=k)[0]
+        context = "\n\n".join([f"# Task:\n{r['prompt']}\n{r['solution']}" for r in retrieved])
+        return generate_code_with_context(user_prompt, context)
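At construction time the pipeline downloads the parquet file and embeds all 164 HumanEval prompts up front; the [0] after retrieve_similar_context unwraps ChromaDB's per-query nesting (see pipeline/retrieval.py below). A minimal sketch of standalone use, reusing the parquet path that app.py passes in, with an illustrative prompt:

from pipeline.pipeline import CodeGenPipeline

pipe = CodeGenPipeline("hf://datasets/openai/openai_humaneval/openai_humaneval/test-00000-of-00001.parquet")
print(pipe.generate_code_from_prompt("Check whether an integer is prime.", k=2))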
pipeline/retrieval.py
ADDED
@@ -0,0 +1,16 @@
+import chromadb
+
+class RetrievalDB:
+    def __init__(self, prompts, embeddings, solutions, collection_name="humaneval"):
+        self.client = chromadb.Client()
+        self.collection = self.client.create_collection(name=collection_name)
+        for idx, (emb, prompt, solution) in enumerate(zip(embeddings, prompts, solutions)):
+            self.collection.add(
+                ids=[str(idx)],
+                embeddings=[emb.tolist()],
+                metadatas=[{"prompt": prompt, "solution": solution}]
+            )
+
+    def retrieve_similar_context(self, query_emb, k=1):
+        results = self.collection.query(query_embeddings=[query_emb], n_results=k)
+        return results["metadatas"]
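Because collection.query takes a batch of query embeddings, the metadatas it returns is a list of lists, one inner list per query, which is why CodeGenPipeline indexes with [0] before iterating. A toy sketch with hypothetical 4-dimensional embeddings (real vectors come from Embedder):

import numpy as np
from pipeline.retrieval import RetrievalDB

# Hypothetical toy corpus, purely for shape illustration.
prompts = ["p0", "p1", "p2"]
solutions = ["s0", "s1", "s2"]
embeddings = [np.random.rand(4) for _ in prompts]

db = RetrievalDB(prompts, embeddings, solutions, collection_name="toy")
hits = db.retrieve_similar_context(embeddings[0], k=2)
print(hits)  # [[{'prompt': ..., 'solution': ...}, {...}]] -- nested per query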
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+pandas
+sentence-transformers
+chromadb
+huggingface_hub
+pyarrow
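One gap worth noting: app.py imports streamlit and pipeline/codegen.py imports dotenv, yet neither streamlit nor python-dotenv is listed above, so a fresh environment would presumably also need:

pip install streamlit python-dotenv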