Skip to content

Commit 001ad1d

Browse files
author
huxinliang8888
committed
v1.0.0
1 parent d4e30f8 commit 001ad1d

2 files changed

Lines changed: 118 additions & 55 deletions

File tree

moveinfo.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,20 @@ def __init__(self,info ):
2727
self.alias = ''
2828
self.year = ''
2929
self.num = '0'
30+
self.tag = ''
3031
try:
31-
self.id = info['index']
32-
self.name = info['名字']
33-
self.director = info['导演']
34-
self.country = info['制片国家/地区']
35-
self.tscore = info['总评分']
36-
self.staring = info['主演']
37-
self.language = info['语言']
38-
self.year = info['出品时间']
39-
self.type = info['类型']
40-
self.alias = info['又名']
41-
self.num = info['评分人数']
32+
self.id = info['索引'].replace(' / ','/').strip()
33+
self.tag = info['类别'].replace(' / ','/').strip()
34+
self.name = info['名字'].replace(' / ','/').strip()
35+
self.director = info['导演'].replace(' / ','/').strip()
36+
self.country = info['制片国家/地区'].replace(' / ','/').strip()
37+
self.tscore = info['总评分'].replace(' / ','/').strip()
38+
self.staring = info['主演'].replace(' / ','/').strip()
39+
self.language = info['语言'].replace(' / ','/').strip()
40+
self.year = info['出品时间'].replace(' / ','/').strip()
41+
self.type = info['类型'].replace(' / ','/').strip()
42+
self.alias = info['又名'].replace(' / ','/').strip()
43+
self.num = info['评分人数'].replace(' / ','/').strip()
4244
except:
4345
pass
4446
#raise InfoError(e.message +'信息格式错误,无法正确解析')
@@ -47,6 +49,7 @@ def __str__(self):
4749
info = ""
4850
info += "索引:%s\n" % self.id
4951
info += "名字:%s\n" % self.name
52+
info += "分类:%s\n" % self.tag
5053
info += "导演:%s\n" % self.director
5154
info += "主演:%s\n" % self.staring
5255
info += "总评分:%s分\n" % self.tscore
@@ -68,6 +71,7 @@ def create_table(db,cursor):
6871
CREATE TABLE IF NOT EXISTS movie_info (
6972
id INT NOT NULL AUTO_INCREMENT,
7073
name VARCHAR(256) NOT NULL,
74+
tag VARCHAR(10) NOT NULL,
7175
director VARCHAR(256) NOT NULL,
7276
country VARCHAR(256) NOT NULL,
7377
tscore VARCHAR(10) NOT NULL,
@@ -96,10 +100,10 @@ def save(db,cursor,movieInfo):
96100
value = ''
97101
valueList = filed.values()
98102
for i in valueList:
99-
if isinstance(i,int):
100-
value += str(i) + ','
101103
if isinstance(i,str):
102-
value += '\"' + i.strip() + '\"' + ','
104+
value += '\"' + i.replace(' ','') + '\"' + ','
105+
else:
106+
value += str(i) + ','
103107
value = value[:-1]
104108
INSERT = "INSERT INTO movie_info ( %s ) VALUES( %s )" %(key,value)
105109
print INSERT

spider.py

Lines changed: 100 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import random
1010
import math
1111
from moveinfo import *
12+
import json
1213
reload(sys)
1314
sys.setdefaultencoding('utf-8')
1415
req_headers = {
@@ -17,15 +18,53 @@
1718
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1819
'Accept-Language': 'zh,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
1920
'Accept-Encoding': 'gzip,deflate',
20-
'Connection': 'keep-alive',
21+
#'Connection': 'keep-alive',
2122
'Upgrade-Insecure-Requests': '1'
2223
}
23-
24-
def get(proxy,url):
24+
UserAgent = ["Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:58.0) Gecko/20100101 Firefox/58.0",
25+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
26+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
27+
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
28+
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
29+
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
30+
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
31+
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
32+
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
33+
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
34+
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
35+
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
36+
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
37+
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
38+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
39+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
40+
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
41+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
42+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
43+
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
44+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
45+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
46+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
47+
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
48+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
49+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
50+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
51+
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
52+
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
53+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
54+
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
55+
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
56+
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
57+
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
58+
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
59+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
60+
]
61+
def getHTML(proxy,url):
2562
url = url
2663
httpproxy_handler = urllib2.ProxyHandler(proxy)
2764
opener = urllib2.build_opener(httpproxy_handler)
28-
request = urllib2.Request(url, headers=req_headers)
65+
header = req_headers
66+
header['User-Agent'] = selectUserAgent()
67+
request = urllib2.Request(url, headers=header)
2968
response = None
3069
try:
3170
response = opener.open(request)
@@ -44,11 +83,12 @@ def get(proxy,url):
4483
if response.headers['Content-Encoding'] == 'gzip':
4584
html = zlib.decompress(html,16+zlib.MAX_WBITS)
4685
return html,None,None
86+
4787
def getMovieInfo(html):
4888
movieInfo = {}
4989
soup = BeautifulSoup(html,'lxml')
50-
name = soup.find('span', property="v:itemreviewed").text
51-
movieInfo['名字'] = str(name)
90+
#name = soup.find('span', property="v:itemreviewed").text
91+
#movieInfo['名字'] = str(name)
5292

5393
year = soup.find('span',class_="year").text[1:-1]
5494
movieInfo['出品时间'] = str(year)
@@ -78,53 +118,39 @@ def getMovieInfo(html):
78118
movieInfo['总评分'] = score
79119
movieInfo['评分人数'] = ratingNum
80120
return movieInfo
81-
def runSpider(start,end,size = 10):
121+
122+
def grabInfo(urlList,proxy):
    """Fetch every movie page in *urlList* and store the parsed details.

    Each element of *urlList* is a dict with the keys 'url', 'id', 'tag'
    and 'name'.  The page at 'url' is downloaded through *proxy* with
    getHTML(), parsed with getMovieInfo(), completed with the id / tag /
    name taken from the list entry, printed and written to the database
    with save().

    Pages that cannot be downloaded or parsed are reported on stdout and
    skipped.  The database connection is closed even when an unexpected
    error escapes the loop.
    """
    db, cursor = init_database()
    try:
        create_table(db, cursor)
        for item in urlList:
            url = item['url']
            html, errorcode, reason = getHTML(proxy, url)
            if html is None:
                # Same text the original multi-argument print produced.
                print('Get HTML from %s failed!  Error code: %s reason: %s'
                      % (url, errorcode, reason))
                continue
            print('Get HTML from %s success!' % url)
            try:
                movieinfo = getMovieInfo(html)
                # These fields come from the search listing rather than
                # from the detail page itself.
                movieinfo['索引'] = item['id']
                movieinfo['类别'] = item['tag']
                movieinfo['名字'] = item['name']
                # Build the record once and reuse it for printing and saving.
                record = BasicInfo(movieinfo)
                print(record)
                save(db, cursor, record)
            except Exception as e:
                # Best effort: one bad page must not abort the whole batch.
                print(e)
                continue
            # Random pause between successful fetches.
            delay_time(3)
    finally:
        db.close()
126148

127-
def getproxy():
149+
def delay_time(t):
    """Sleep for a random duration between 0 and *t* seconds.

    A negative *t* is treated as 0, because time.sleep() rejects negative
    durations with ValueError.
    """
    upper = max(0, t)
    time.sleep(random.uniform(0, upper))
152+
153+
def getProxyPool():
128154
url = 'http://www.xicidaili.com/wn'
129155
header = req_headers
130156
header['Host'] ='www.xicidaili.com'
@@ -144,15 +170,48 @@ def getproxy():
144170
return proxy
145171

146172
def selectProxy(pool=None):
    """Return one randomly chosen proxy.

    pool -- optional sequence of proxies to choose from.  When omitted,
            the pool is fetched with getProxyPool(), as before.

    Raises IndexError when the pool is empty.
    """
    if pool is None:
        pool = getProxyPool()
    # random.choice never yields an out-of-range index, unlike
    # int(random.uniform(0, n)), whose upper bound may be hit by rounding.
    return random.choice(pool)
151177

178+
def selectUserAgent(agents=None):
    """Return one randomly chosen User-Agent string.

    agents -- optional sequence of User-Agent strings to choose from.
              When omitted, the module-level UserAgent list is used.

    Raises IndexError when the sequence is empty.
    """
    if agents is None:
        agents = UserAgent
    # random.choice never yields an out-of-range index, unlike
    # int(random.uniform(0, n)), whose upper bound may be hit by rounding.
    return random.choice(agents)
182+
152183
def printInfo(movieinfo):
    """Print every entry of *movieinfo* as "key value", one per line."""
    for field, content in movieinfo.items():
        print('%s %s' % (field, content))
155-
runSpider(1295096,1295096+10000)
186+
187+
188+
def runSpider(tag=None, startPage=0, Pagenum=100, size=100):
    """Walk the search listing for each tag and grab the movies found.

    tag       -- list of tag strings to crawl; defaults to ['电影'].
    startPage -- first listing page to request (0-based).
    Pagenum   -- listing page at which to stop (exclusive upper bound of
                 the page range).
    size      -- number of collected movie entries that triggers a call to
                 grabInfo().

    Every listing page is requested as JSON; the id, url and title of each
    entry are queued, and the queue is handed to grabInfo() whenever it
    reaches *size* entries.  Entries still queued when the crawl ends are
    grabbed as a final, smaller batch.  Errors on a page are printed and
    the crawl moves on to the next page.
    """
    # None instead of a mutable default; the argument is now honoured
    # rather than being overwritten with a hard-coded list.
    tags = ['电影'] if tag is None else tag
    pageSize = 20  # the listing offset advances by this many entries per page
    urllist = []
    for current in tags:
        for page in range(startPage, Pagenum):
            offset = page * pageSize
            url = "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%s&start=%d" % (current, offset)
            print('grab Tag:%s Page:%d' % (current, offset))
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                data = json.loads(response.read())
                for k in data['data']:
                    urllist.append({'id': str(k['id']),
                                    'url': str(k['url']),
                                    'tag': current,
                                    'name': str(k['title'])})
                    if len(urllist) >= size:
                        # Detach the batch first so a failure inside
                        # grabInfo cannot cause it to be processed twice.
                        batch, urllist = urllist, []
                        grabInfo(batch, {})
                # Random pause between listing requests.
                delay_time(10)
            except Exception as e:
                # Best effort: report the problem and try the next page.
                print(e)
    if urllist:
        # Flush the entries that never filled a complete batch.
        grabInfo(urllist, {})
212+
213+
#runSpider(1295096,1295096+10000)
156214
#html ,s,t= get("https://movie.douban.com/subject/1309046")
157215
#movieinfo = getMovieInfo(html)
158216
#getproxy()
217+
runSpider()

0 commit comments

Comments
 (0)