GuglielmoTor committed on
Commit
16353a0
·
verified ·
1 Parent(s): 4d49280

Create posts_categorization.py

Browse files
Files changed (1) hide show
  1. posts_categorization.py +105 -0
posts_categorization.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from groq import Groq
3
+ import instructor
4
+ from pydantic import BaseModel
5
+ import os
6
+
7
# Groq credential taken from the environment; None when GROQ_API_KEY is unset.
api_key = os.environ.get('GROQ_API_KEY')

# One shared Groq client, patched by instructor (JSON mode) so that chat
# completions can be requested with a pydantic `response_model`.
client = instructor.from_groq(
    Groq(api_key=api_key),
    mode=instructor.Mode.JSON,
)
11
+
12
# Pydantic schema for the summarization response: a JSON object with one
# string field. Documented with comments rather than a docstring so the
# generated JSON schema stays exactly as before.
class SummaryOutput(BaseModel):
    # Short summary text returned by the model.
    summary: str
14
+
15
# Pydantic schema for the classification response: a JSON object with one
# string field. Documented with comments rather than a docstring so the
# generated JSON schema stays exactly as before.
class ClassificationOutput(BaseModel):
    # Name of the category chosen by the model.
    category: str
18
+
19
# Summarize post text
def summarize_post(text):
    """Summarize a LinkedIn post in a few words using the shared Groq client.

    Args:
        text: Post body. ``None``, NaN/NA scalars and blank text are treated
            as missing.

    Returns:
        The ``summary`` string parsed from the model response, or ``None``
        when the input is missing/blank or the request fails (the error is
        printed, not raised).
    """
    if text is None or pd.isna(text):
        return None

    truncated = str(text)[:2000]  # truncate to avoid token overflow
    if not truncated.strip():
        # Nothing to summarize: skip the API call rather than asking the
        # model to produce a summary of empty text.
        return None

    prompt = f"""
    Summarize the following LinkedIn post in 5 to 10 words.
    Only return the summary inside a JSON field called 'summary'.

    Post Text:
    \"\"\"{truncated}\"\"\"
    """

    try:
        response = client.chat.completions.create(
            model="deepseek-r1-distill-llama-70b",
            response_model=SummaryOutput,
            messages=[
                {"role": "system", "content": "You are a precise summarizer. Only return a JSON object with a 'summary' string."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
        )
        return response.summary
    except Exception as e:
        # Best-effort: a failed request must not abort a whole batch.
        print(f"Summarization error: {e}")
        return None
48
+
49
+
50
# Classify post summary into structured categories
def classify_post(summary, labels):
    """Choose the single most relevant category for a post summary.

    Args:
        summary: Summary text to classify. ``None`` and NaN/NA scalars are
            treated as missing.
        labels: Iterable of category names the model may choose from.

    Returns:
        The matching entry of ``labels`` (spelled as supplied), the string
        ``"None"`` when the model reports that no category applies, or
        ``None`` when the input is missing, the request fails, or the model
        answers with something outside the list (the problem is printed).
    """
    if summary is None or pd.isna(summary):
        return None

    labels = list(labels)

    # One label per line: several labels contain commas themselves, so a
    # comma-separated list would not show where one label ends.
    prompt = "\n".join([
        f'Post Summary: "{summary}"',
        "",
        "Available Categories:",
        *[f"- {label}" for label in labels],
        "",
        "Task: Choose the single most relevant category from the list above that applies to this summary. Return only one category in a structured JSON format under the field 'category'.",
        "If no category applies, return 'None'.",
    ])

    try:
        result = client.chat.completions.create(
            model="meta-llama/llama-4-maverick-17b-128e-instruct",
            response_model=ClassificationOutput,
            messages=[
                {"role": "system", "content": "You are a strict classifier. Return only one matching category name under the field 'category'."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            max_tokens=60,
        )
    except Exception as e:
        # Best-effort: a failed request must not abort a whole batch.
        print(f"Classification error: {e}")
        return None

    answer = str(result.category).strip()

    # Map the answer back onto the caller's spelling, ignoring case and
    # surrounding whitespace.
    canonical = {str(label).casefold(): label for label in labels}
    matched = canonical.get(answer.casefold())
    if matched is not None:
        return matched

    # The prompt explicitly allows 'None' even if it is not among the labels.
    if answer.casefold() == "none":
        return "None"

    print(f"Classification error: unexpected category {answer!r}")
    return None
79
+
80
def summarize_and_classify_post(text, labels):
    """Summarize one post and, if a summary was produced, classify it.

    Returns a dict with keys ``"summary"`` and ``"category"``; the category
    is ``None`` whenever the summary is ``None`` or empty.
    """
    outcome = {"summary": summarize_post(text), "category": None}
    if outcome["summary"]:
        # Classification works on the summary, not on the raw post text.
        outcome["category"] = classify_post(outcome["summary"], labels)
    return outcome
87
+
88
# Categories used when the caller does not supply any.
_DEFAULT_LABELS = (
    "Company Culture and Values",
    "Employee Stories and Spotlights",
    "Work-Life Balance, Flexibility, and Well-being",
    "Diversity, Equity, and Inclusion (DEI)",
    "Professional Development and Growth Opportunities",
    "Mission, Vision, and Social Responsibility",
    "None",
)


def batch_summarize_and_classify(posts, labels=None):
    """Summarize and classify every post in ``posts``.

    Args:
        posts: Iterable of mappings; each post's text is read with
            ``post.get("text")``.
        labels: Category names to classify into. When ``None`` or empty, the
            built-in default categories are used. Previously this argument
            was overwritten unconditionally, so caller labels were ignored.

    Returns:
        A list with one ``{"summary": ..., "category": ...}`` dict per post,
        in input order.
    """
    if labels is None or len(labels) == 0:
        labels = list(_DEFAULT_LABELS)

    return [
        summarize_and_classify_post(post.get("text"), labels)
        for post in posts
    ]