lianjia.py
# coding:utf8
# Crawls Beijing residential-community ("xiaoqu") listings from bj.lianjia.com
# and stores each record in MongoDB (raw) and MySQL (normalized).
import os
import sys

# Make the parent directory importable so basecrawler can be found.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# BaseCrawler is assumed to provide requests_get() (an HTTP GET helper) and a
# configured self.logger; BeautifulSoup is re-exported by basecrawler.
from basecrawler import BaseCrawler, BeautifulSoup
import re
from collections import OrderedDict
import pymongo
import pymysql
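# The MySQL table is not created by this script. A minimal sketch of the
# schema it appears to assume (column types are guesses inferred from the
# values inserted in insert_data below):
#
#   CREATE TABLE xiaoqu (
#       id          INT AUTO_INCREMENT PRIMARY KEY,
#       province    VARCHAR(32),
#       city        VARCHAR(32),
#       `name`      VARCHAR(128),
#       total       INT,
#       price       VARCHAR(32),
#       create_year INT,
#       developer   VARCHAR(128),
#       buildings   INT,
#       list_url    VARCHAR(255),
#       content_url VARCHAR(255),
#       address     VARCHAR(255)
#   ) DEFAULT CHARSET=utf8mb4;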
class LianJia(BaseCrawler):
    def __init__(self):
        # Connect to MySQL with utf8mb4 so 4-byte UTF-8 characters in
        # community names survive; passing charset here replaces the
        # redundant repeated "set names utf8mb4" statements.
        self.mysqldb = pymysql.connect(
            host="127.0.0.1", user="root", password="",
            db="xiaoqu", charset="utf8mb4")
        self.cursor = self.mysqldb.cursor()
        self.mongo = pymongo.MongoClient(["127.0.0.1:27017"], maxPoolSize=10)
        self.mongodb = self.mongo["wechat"]
        super(LianJia, self).__init__()
    def run(self):
        # Walk the paginated list pages pg1 .. pg399.
        for page in range(1, 400):
            url = "https://bj.lianjia.com/xiaoqu/pg%s/" % page
            print(page)
            self.get_content_urls(url)
    def get_content_urls(self, url):
        try:
            resp = self.requests_get(url)
        except Exception as e:
            self.logger.error("req failure: %s", e)
            return
        soup = BeautifulSoup(resp.text, 'lxml')
        # Each list entry links to a community detail page.
        for node_a in soup.select('div.info div.title a'):
            try:
                content_url = node_a["href"]
            except Exception as e:
                print(str(e))
                continue
            print(content_url)
            data = self.get_content(content_url)
            if not data:
                data = OrderedDict()
            data["list_url"] = url
            data["content_url"] = content_url
            self.insert_data(data)
    def get_content(self, url):
        try:
            resp = self.requests_get(url)
        except Exception as e:
            self.logger.error("req content page failure: %s", e)
            return
        soup = BeautifulSoup(resp.text, 'lxml')
        data = OrderedDict()
        # Each field is extracted independently so one missing element does
        # not abort the whole record. The nth-of-type selectors depend on the
        # current layout of Lianjia's detail page and may need updating.
        try:
            data["name"] = soup.select("h1.detailTitle")[0].get_text().strip()
        except Exception as e:
            self.logger.error("get name field failure: %s", e)
        try:
            data["address"] = soup.select("div.detailDesc")[0].get_text().strip()
        except Exception as e:
            self.logger.error("get address field failure: %s", e)
        try:
            data["price"] = soup.select("span.xiaoquUnitPrice")[0].get_text().strip()
        except Exception as e:
            self.logger.error("get price field failure: %s", e)
        try:
            create_year = soup.select("div.xiaoquInfo > div:nth-of-type(1) > span.xiaoquInfoContent")[0].get_text().strip()
            data["create_year"] = self.get_number(create_year)
        except Exception as e:
            self.logger.error("get create_year field failure: %s", e)
        try:
            data["developer"] = soup.select("div.xiaoquInfo > div:nth-of-type(5) > span.xiaoquInfoContent")[0].get_text().strip()
        except Exception as e:
            self.logger.error("get developer field failure: %s", e)
        try:
            buildings = soup.select("div.xiaoquInfo > div:nth-of-type(6) > span.xiaoquInfoContent")[0].get_text().strip()
            data["buildings"] = self.get_number(buildings)
        except Exception as e:
            self.logger.error("get buildings field failure: %s", e)
        try:
            total = soup.select("div.xiaoquInfo > div:nth-of-type(7) > span.xiaoquInfoContent")[0].get_text().strip()
            data["total"] = self.get_number(total)
        except Exception as e:
            self.logger.error("get total field failure: %s", e)
        # The breadcrumb links give the province and city.
        try:
            data["province"] = soup.select("div.fl.l-txt a:nth-of-type(2)")[0].get_text().strip()
        except Exception as e:
            self.logger.error("get province field failure: %s", e)
        try:
            data["city"] = soup.select("div.fl.l-txt a:nth-of-type(3)")[0].get_text().strip()
        except Exception as e:
            self.logger.error("get city field failure: %s", e)
        return data
    def get_number(self, text):
        # Pull the first run of digits, e.g. u"2003年建成" -> "2003".
        return re.search(r'\d+', text, flags=re.S).group()
    def insert_data(self, data):
        # Store the raw record in MongoDB, then a normalized row in MySQL.
        try:
            self.mongodb["xiaoqu"].insert_one(data)
        except Exception as e:
            self.logger.error("mongo insert failure: %s", e)
        # Parameterized query: the driver handles quoting and escaping.
        sql = ("insert into xiaoqu(id, province, city, `name`, total, price, "
               "create_year, developer, buildings, list_url, content_url, address) "
               "values (null, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (
            data.get("province", u"北京"),
            data.get("city", u""),
            data.get("name", u""),
            data.get("total", 0),
            data.get("price", 0),
            data.get("create_year", 0),
            data.get("developer", u""),
            data.get("buildings", 0),
            data.get("list_url", u""),
            data.get("content_url", u""),
            data.get("address", u""),
        ))
        self.mysqldb.commit()
if __name__ == "__main__":
    lj = LianJia()
    lj.run()
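# Assumed setup before running: MySQL reachable on 127.0.0.1 with the xiaoqu
# database and table sketched above, MongoDB on 127.0.0.1:27017, and the
# sibling basecrawler module importable. Then: python lianjia.py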