-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproperty_data.py
More file actions
134 lines (111 loc) · 4.88 KB
/
property_data.py
File metadata and controls
134 lines (111 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import logging
from typing import List, Dict, Optional
import numpy as np
import requests
from io import BytesIO
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
# Set up logging configuration.
# NOTE(review): calling basicConfig at import time configures the *root*
# logger as a module side effect; a module-level
# `logger = logging.getLogger(__name__)` is usually preferred — confirm no
# other module in the project relies on this root configuration.
logging.basicConfig(level=logging.INFO)
class PropertyData:
    """
    Handles property data and embedding generation with proper normalization.

    Text embeddings come from a MiniLM SentenceTransformer and image
    embeddings from a CLIP SentenceTransformer; every embedding returned by
    this class is L2-normalized, so dot products are cosine similarities.
    """

    def __init__(self):
        # Initialize text and image embedding models (downloaded/cached by
        # sentence-transformers on first use).
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.image_model = SentenceTransformer('clip-ViT-B-32')

    def preprocess_text(self, text: str) -> str:
        """
        Normalize and clean text data.

        Args:
            text (str): The text to preprocess.

        Returns:
            str: The text lowercased and stripped of surrounding whitespace.
        """
        return text.lower().strip()

    def generate_text_embeddings(self, property_data: Dict) -> Dict[str, np.ndarray]:
        """
        Generate normalized text embeddings for different property attributes.

        Args:
            property_data (Dict): The property data. ``None`` values are
                treated as missing. The caller's dict is NOT mutated.

        Returns:
            Dict[str, np.ndarray]: L2-normalized embeddings keyed by
            ``"location"``, ``"features"`` and ``"description"``.
        """
        # BUG FIX: the original guard `isinstance(value, str) and value is None`
        # could never be true (a value cannot be both a str and None), so None
        # values leaked into the embedding text as the literal string "None".
        # Sanitize into a local copy so the caller's dict is never mutated.
        data = {key: ('' if value is None else value)
                for key, value in property_data.items()}

        def _get_list(key: str) -> List:
            # Missing, None, or empty entries all become an empty list so the
            # `*` unpacking below never raises TypeError.
            return data.get(key) or []

        # Prepare text data for embeddings.
        location_text = self.preprocess_text(
            f"{data.get('city', '')} "
            f"{data.get('county_or_parish', '')} "
            f"{data.get('state_or_province', '')} "
            f"{data.get('country', '')}"
        )
        property_features_text = self.preprocess_text(" ".join([
            *_get_list('association_amenities'),
            *_get_list('interior_features'),
            *_get_list('exterior_features'),
            *_get_list('appliances'),
            *_get_list('lot_features'),
            f"property_type: {data.get('lp_property_type', '')}",
            f"architectural_style: {data.get('architectural_style', '')}",
            f"lp_sale_lease: {data.get('lp_sale_lease', '')}",
            *_get_list('accessibility_features'),
            *_get_list('building_features'),
            *_get_list('fireplace_features'),
            *_get_list('laundry_features'),
            *_get_list('parking_features'),
            *_get_list('pool_features'),
            *_get_list('security_features'),
            *_get_list('waterfront_features'),
        ]))
        description_text = self.preprocess_text(
            data.get('lp_listing_description', ''),
        )

        # Generate embeddings.
        embeddings = {
            "location": self.text_model.encode(location_text),
            "features": self.text_model.encode(property_features_text),
            "description": self.text_model.encode(description_text)
        }
        # L2-normalize each embedding (sklearn normalize works on 2-D arrays,
        # hence the reshape/[0] round-trip).
        for key in embeddings:
            embeddings[key] = normalize(embeddings[key].reshape(1, -1))[0]
        return embeddings

    def generate_image_embedding(self, image_urls: List[str]) -> Optional[np.ndarray]:
        """
        Generate a normalized aggregated image embedding from property photos.

        Args:
            image_urls (List[str]): A list of image URLs. Only the first 5
                are fetched to bound network cost.

        Returns:
            Optional[np.ndarray]: The L2-normalized mean embedding, or None
            if no image could be fetched and processed.
        """
        embeddings = []
        for url in image_urls[:5]:  # Limit to first 5 images
            try:
                # Download image (10 s timeout so a dead host can't hang us).
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                img = Image.open(BytesIO(response.content)).convert('RGB')
                # Generate a per-image embedding, normalized before averaging
                # so each image contributes equally regardless of magnitude.
                embedding = self.image_model.encode(img)
                embedding = normalize(embedding.reshape(1, -1))[0]
                embeddings.append(embedding)
            except requests.exceptions.RequestException as e:
                # Best-effort: skip unreachable images rather than failing.
                logging.warning(f"Error fetching image {url}: {e}")
                continue
            except Exception as e:
                # Best-effort: skip undecodable/unencodable images too.
                logging.warning(f"Error processing image {url}: {e}")
                continue
        if not embeddings:
            return None
        # Aggregate embeddings by computing the mean, then re-normalize since
        # the mean of unit vectors is generally not itself a unit vector.
        mean_embedding = np.mean(embeddings, axis=0)
        mean_embedding = normalize(mean_embedding.reshape(1, -1))[0]
        return mean_embedding