Spaces:
Sleeping
Sleeping
Create posts_categorization.py
Browse files- posts_categorization.py +105 -0
posts_categorization.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from groq import Groq
|
3 |
+
import instructor
|
4 |
+
from pydantic import BaseModel
|
5 |
+
import os
|
6 |
+
|
7 |
+
# Groq credential, looked up once in the environment when the module loads.
api_key = os.environ.get('GROQ_API_KEY')

# One shared Groq client, patched by instructor (JSON mode) for structured
# output; both the summarizer and the classifier below go through it.
client = instructor.from_groq(
    Groq(api_key=api_key),
    mode=instructor.Mode.JSON,
)
|
11 |
+
|
12 |
+
# Pydantic schema for the summarization response.
# Kept docstring-free on purpose: a class docstring would be picked up as the
# schema description and change what is sent to the model.
class SummaryOutput(BaseModel):
    # Short summary text produced by the model.
    summary: str
|
14 |
+
|
15 |
+
# Define pydantic schema for classification output
|
16 |
+
# Response schema for the classification call.
# Kept docstring-free on purpose: a class docstring would be picked up as the
# schema description and change what is sent to the model.
class ClassificationOutput(BaseModel):
    # Category name chosen by the model (free-form string, not validated here).
    category: str
|
18 |
+
|
19 |
+
# Summarize post text
|
20 |
+
def summarize_post(text):
    """Summarize a LinkedIn post in a few words via the shared Groq client.

    Args:
        text: Raw post text (scalar). Non-string values are converted with
            ``str``; only the first 2000 characters are sent to the model.

    Returns:
        The ``summary`` string from the model's structured response, or
        ``None`` when the input is missing (``None``/NaN), blank, or the
        request fails for any reason (the error is printed, not raised).
    """
    # Identity check first: cheap, and it short-circuits the pandas call.
    if text is None or pd.isna(text):
        return None

    text = str(text)[:2000]  # truncate to avoid token overflow

    # Nothing to summarize: skip the API call rather than asking the model
    # to summarize an empty post.
    if not text.strip():
        return None

    prompt = f"""
    Summarize the following LinkedIn post in 5 to 10 words.
    Only return the summary inside a JSON field called 'summary'.

    Post Text:
    \"\"\"{text}\"\"\"
    """

    try:
        response = client.chat.completions.create(
            model="deepseek-r1-distill-llama-70b",
            response_model=SummaryOutput,
            messages=[
                {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )
        return response.summary
    except Exception as e:
        # Best-effort: one failed post must not abort a whole batch.
        print(f"Summarization error: {e}")
        return None
|
48 |
+
|
49 |
+
|
50 |
+
# Classify post summary into structured categories
|
51 |
+
def classify_post(summary, labels):
    """Pick the single best-matching category for a post summary.

    Args:
        summary: Short summary of the post (scalar, normally a string).
        labels: Iterable of candidate category names offered to the model.

    Returns:
        The ``category`` string from the model's structured response, or
        ``None`` when the summary is missing (``None``/NaN), blank, or the
        request fails (the error is printed, not raised). The returned value
        is not checked against ``labels``.
    """
    # Identity check first: cheap, and it short-circuits the pandas call.
    if summary is None or pd.isna(summary):
        return None

    # A blank summary carries no signal to classify; skip the API call.
    if not str(summary).strip():
        return None

    # One label per line: several labels contain commas themselves, so a
    # comma-joined list would be ambiguous to the model.
    label_lines = "\n".join(f"    - {label}" for label in labels)

    prompt = f"""
    Post Summary: "{summary}"

    Available Categories (one per line):
{label_lines}

    Task: Choose the single most relevant category from the list above that applies to this summary. Return only one category in a structured JSON format under the field 'category'.
    If no category applies, return 'None'.
    """
    try:
        result = client.chat.completions.create(
            model="meta-llama/llama-4-maverick-17b-128e-instruct",
            response_model=ClassificationOutput,
            messages=[
                {"role": "system", "content": "You are a strict classifier. Return only one matching category name under the field 'category'."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=60
        )
        return result.category
    except Exception as e:
        # Best-effort: one failed post must not abort a whole batch.
        print(f"Classification error: {e}")
        return None
|
79 |
+
|
80 |
+
def summarize_and_classify_post(text, labels):
    """Run the two-step pipeline (summarize, then classify) on one post.

    Classification is attempted only when summarization produced a truthy
    summary; otherwise the category is ``None``.

    Returns:
        A dict with the keys ``"summary"`` and ``"category"``.
    """
    digest = summarize_post(text)

    if digest:
        chosen = classify_post(digest, labels)
    else:
        chosen = None

    return dict(summary=digest, category=chosen)
|
87 |
+
|
88 |
+
def batch_summarize_and_classify(posts, labels=None):
    """Summarize and classify every post in a collection.

    Args:
        posts: Iterable of mapping-like posts; each post's text is read with
            ``post.get("text")``.
        labels: Candidate category names. When ``None`` or empty, the
            built-in list of categories below is used. Caller-supplied labels
            are honoured; previously they were always overwritten by the
            built-in list.

    Returns:
        A list with one ``{"summary": ..., "category": ...}`` dict per post,
        in input order.
    """
    if not labels:
        # Fallback categories, built fresh on each call so the list is never
        # shared between calls.
        labels = [
            "Company Culture and Values",
            "Employee Stories and Spotlights",
            "Work-Life Balance, Flexibility, and Well-being",
            "Diversity, Equity, and Inclusion (DEI)",
            "Professional Development and Growth Opportunities",
            "Mission, Vision, and Social Responsibility",
            "None",
        ]

    return [
        summarize_and_classify_post(post.get("text"), labels)
        for post in posts
    ]
|