-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNLP_ASSIG.py
More file actions
113 lines (95 loc) · 4.04 KB
/
NLP_ASSIG.py
File metadata and controls
113 lines (95 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
NLP Short Assignment – Sentiment Analysis
-----------------------------------------
1. Dataset Prep
2. Prompt Engineering
3. Evaluation
4. Troubleshooting
"""
# =============================
# 1) NLP & Dataset Prep
# =============================
import re
# Mini IMDb-style dataset (public-domain review style, small for demo).
# Each example becomes a dict with "text" (raw review) and "label"
# (1 = positive, 0 = negative); three of each class.
_LABELED_REVIEWS = [
    ("Absolutely loved it. The performances were outstanding!", 1),
    ("Terrible movie. Boring plot and a waste of time.", 0),
    ("What a delightful surprise! Smart writing and heartfelt moments.", 1),
    ("I wanted to like it, but it was not good. Confusing and slow.", 0),
    ("Great soundtrack and visuals. I had a great time watching it!", 1),
    ("This is bad. The jokes never land and the pacing is awful.", 0),
]
DATASET = [{"text": text, "label": label} for text, label in _LABELED_REVIEWS]
# Cleaning + tokenization.
# Anything outside lowercase letters, digits and whitespace is stripped.
_NON_ALNUM = re.compile(r"[^a-z0-9\s]")

def clean_text(text: str) -> str:
    """Lowercase *text*, remove punctuation/symbols, and trim surrounding whitespace."""
    return _NON_ALNUM.sub("", text.lower()).strip()
def tokenize(text: str):
    """Return the lowercase word tokens of *text* after cleaning (whitespace split)."""
    normalized = clean_text(text)
    return normalized.split()

# Show one normalized example so the prep step is visible when the script runs.
print("Sample cleaned review:", clean_text(DATASET[0]["text"]))
# =============================
# 2) Prompt Engineering & Model
# =============================
# Prefer a pretrained HF sentiment pipeline; when transformers (or the model
# load) is unavailable, fall back to a tiny keyword-vote heuristic that
# exposes the same call/return shape as the pipeline.
try:
    from transformers import pipeline
    clf = pipeline("sentiment-analysis",
                   model="distilbert-base-uncased-finetuned-sst-2-english")
    use_llm = True
except Exception:
    # Fallback heuristic classifier if transformers not available.
    POS = {"love", "great", "wonderful", "delightful", "smart", "outstanding"}
    NEG = {"terrible", "boring", "bad", "awful", "waste", "slow"}

    def clf(text):
        """Keyword-count classifier mimicking the HF pipeline output format."""
        words = tokenize(text)
        pos_hits = sum(w in POS for w in words)
        neg_hits = sum(w in NEG for w in words)
        diff = pos_hits - neg_hits
        # Ties (diff == 0) count as POSITIVE; score is a rough 0..1-ish vote ratio.
        return [{"label": "POSITIVE" if diff >= 0 else "NEGATIVE",
                 "score": abs(diff) / 3}]

    use_llm = False
# Example text for the prompt-variation demo.
sample_text = DATASET[0]["text"]
# Three prompt variations for the same classification task.
prompts = [
    f"Classify the sentiment of this review: {sample_text}",
    f"Is this movie review positive or negative? Review: {sample_text}",
    f"Analyze and return JSON with 'label' and 'confidence'. Text: {sample_text}",
]
print("\n--- Prompt Engineering Outputs ---")
for idx, prompt in enumerate(prompts, 1):
    # NOTE(review): `prompt` is constructed but never sent anywhere — clf is a
    # plain sentiment classifier, so every iteration scores sample_text itself.
    # Kept as-is to preserve behavior; confirm whether prompts should feed an LLM.
    result = clf(sample_text)[0]
    if idx != 3:
        print(f"Prompt {idx} -> {result['label']} (conf {result['score']:.2f})")
    else:
        # Third variation asks for JSON, so echo a JSON-looking line.
        print(f"Prompt {idx} -> {{'label': '{result['label']}', 'confidence': {result['score']:.2f}}}")
# =============================
# 3) Evaluation
# =============================
# Gold labels and model predictions over the whole mini dataset.
y_true = [row["label"] for row in DATASET]
y_pred = []
for row in DATASET:
    predicted = clf(row["text"])[0]["label"]
    y_pred.append(1 if predicted == "POSITIVE" else 0)
# Metrics (pure python version)
def compute_metrics(y_true, y_pred):
    """Compute binary-classification metrics from parallel label lists.

    Args:
        y_true: sequence of gold labels (0 or 1).
        y_pred: sequence of predicted labels (0 or 1), same length as y_true.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` as floats. Returns all
        zeros for empty input instead of raising ZeroDivisionError.
    """
    if not y_true:
        # Guard: the original divided by len(y_true) unconditionally.
        return 0.0, 0.0, 0.0, 0.0
    tp = sum(t == p == 1 for t, p in zip(y_true, y_pred))
    tn = sum(t == p == 0 for t, p in zip(y_true, y_pred))
    fp = sum(t == 0 and p == 1 for t, p in zip(y_true, y_pred))
    fn = sum(t == 1 and p == 0 for t, p in zip(y_true, y_pred))
    acc = (tp + tn) / len(y_true)
    # Zero denominators (no predicted/actual positives) yield 0.0 by convention.
    prec = tp / (tp + fp) if tp + fp else 0.0
    rec = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    return acc, prec, rec, f1
# Score the predictions and print a one-line metrics summary.
acc, prec, rec, f1 = compute_metrics(y_true, y_pred)
print("\n--- Evaluation Metrics ---")
metric_line = ", ".join(
    f"{name}: {value:.2f}"
    for name, value in zip(("Accuracy", "Precision", "Recall", "F1"),
                           (acc, prec, rec, f1))
)
print(metric_line)
# =============================
# 4) Troubleshooting
# =============================
# Written reflection required by the assignment, emitted at runtime
# as one multi-line print (same output as three separate prints).
print("\n--- Troubleshooting Note ---",
      "Issue: Sarcasm and negation (e.g., 'yeah right, great movie') confuse models.",
      "Fix: Add sarcastic examples to dataset, or prompt model to consider sarcasm explicitly.",
      sep="\n")
# =============================
# Sanity Tests
# =============================
def run_tests():
    """Smoke-check the text-normalization helpers, then report success."""
    checks = [
        (clean_text(" Hello!! "), "hello"),
        (tokenize("Not great!!"), ["not", "great"]),
    ]
    for got, want in checks:
        assert got == want
    print("All tests passed ✅")

run_tests()