Skip to content

Commit d4e30f8

Browse files
author
huxinliang8888
committed
0-commit
0 parents  commit d4e30f8

7 files changed

Lines changed: 367 additions & 0 deletions

File tree

.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/spider.iml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/workspace.xml

Lines changed: 48 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

moveinfo.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# -*- coding: utf-8 -*-
# coding=utf-8
import sys
import MySQLdb
# Python 2 hack: reload(sys) restores the setdefaultencoding attribute that
# site.py removes, then utf-8 is forced as the default str<->unicode codec so
# the Chinese literals below mix with byte strings without explicit encodes.
# NOTE(review): sys.setdefaultencoding is unsupported and removed in Python 3;
# this module is Python 2 only.
reload(sys)
sys.setdefaultencoding('utf-8')
7+
8+
# MySQL connection parameters consumed by init_database().
# NOTE(review): credentials are hard-coded in source; move them to a config
# file or environment variables for anything beyond local testing.
host = 'localhost'
user = 'root'
password = 'admin'
database = 'test'
12+
13+
class InfoError(Exception):
    """Raised when scraped movie info cannot be parsed into a record.

    The message is passed to ``Exception.__init__`` so that ``str(e)``
    and ``e.args`` carry it (the original stored it only on ``value``,
    leaving ``str(e)`` empty). ``value`` is kept for backward
    compatibility with existing callers.
    """

    def __init__(self, value):
        Exception.__init__(self, value)
        self.value = value
16+
17+
class BasicInfo(object):
    """Flat record of one douban movie built from a parsed info dict.

    Fields default to empty/zero values; any key present in ``info``
    overrides the matching attribute. A non-dict ``info`` (e.g. ``None``)
    simply leaves every default in place.

    Fix over the original: all assignments sat inside one bare
    ``try/except: pass``, so the first missing key silently aborted the
    remaining assignments. Each field now falls back to its own default
    independently.
    """

    def __init__(self, info):
        # Defaults double as the schema used by save()'s column list.
        self.id = 0
        self.name = ''
        self.director = ''
        self.country = ''
        self.tscore = '0'
        self.staring = ''
        self.language = ''
        self.type = ''
        self.alias = ''
        self.year = ''
        self.num = '0'
        if isinstance(info, dict):
            self.id = info.get('index', self.id)
            self.name = info.get('名字', self.name)
            self.director = info.get('导演', self.director)
            self.country = info.get('制片国家/地区', self.country)
            self.tscore = info.get('总评分', self.tscore)
            self.staring = info.get('主演', self.staring)
            self.language = info.get('语言', self.language)
            self.year = info.get('出品时间', self.year)
            self.type = info.get('类型', self.type)
            self.alias = info.get('又名', self.alias)
            self.num = info.get('评分人数', self.num)

    def __str__(self):
        """Return a human-readable summary, one labelled field per line."""
        parts = [
            "索引:%s\n" % self.id,
            "名字:%s\n" % self.name,
            "导演:%s\n" % self.director,
            "主演:%s\n" % self.staring,
            "总评分:%s分\n" % self.tscore,
            "评分人数:%s\n" % self.num,
            "制片国家/地区:%s\n" % self.country,
            "语言:%s\n" % self.language,
            "类型:%s\n" % self.type,
            "出品时间:%s\n" % self.year,
            "又名:%s\n" % self.alias,
        ]
        return "".join(parts)
60+
61+
def init_database():
    """Open a MySQL connection (utf8 charset) and return (connection, cursor).

    Connection parameters come from the module-level host/user/password/
    database constants.
    """
    connection = MySQLdb.connect(host, user, password, database, charset='utf8')
    return connection, connection.cursor()
65+
66+
def create_table(db,cursor):
67+
CREATE = '''
68+
CREATE TABLE IF NOT EXISTS movie_info (
69+
id INT NOT NULL AUTO_INCREMENT,
70+
name VARCHAR(256) NOT NULL,
71+
director VARCHAR(256) NOT NULL,
72+
country VARCHAR(256) NOT NULL,
73+
tscore VARCHAR(10) NOT NULL,
74+
num VARCHAR(20) NOT NULL,
75+
staring VARCHAR(512) NOT NULL,
76+
language VARCHAR(40) NOT NULL,
77+
year VARCHAR(20) NOT NULL,
78+
type VARCHAR(256) NOT NULL,
79+
alias VARCHAR(256) NOT NULL,
80+
PRIMARY KEY(id)
81+
) default charset=utf8;
82+
'''
83+
try:
84+
cursor.execute(CREATE)
85+
db.commit()
86+
except Exception as e:
87+
print e
88+
db.rollback()
89+
def save(db,cursor,movieInfo):
90+
filed = movieInfo.__dict__
91+
keyList = filed.keys()
92+
key = ''
93+
for i in keyList:
94+
key += str(i) + ','
95+
key = key[:-1]
96+
value = ''
97+
valueList = filed.values()
98+
for i in valueList:
99+
if isinstance(i,int):
100+
value += str(i) + ','
101+
if isinstance(i,str):
102+
value += '\"' + i.strip() + '\"' + ','
103+
value = value[:-1]
104+
INSERT = "INSERT INTO movie_info ( %s ) VALUES( %s )" %(key,value)
105+
print INSERT
106+
try:
107+
cursor.execute(INSERT)
108+
db.commit()
109+
print "insert succes!"
110+
except Exception as e:
111+
print e
112+
db.rollback()
113+
# Disabled ad-hoc test harness, neutralized by wrapping it in a module-level
# string literal. NOTE(review): prefer deleting dead code or guarding it with
# `if __name__ == '__main__':`.
'''
def test():
    db,cursor = init_database()
    #create_table(db,cursor)
    #save(cursor,BasicInfo(None))
    cursor.execute("select * from movie_info")
    f = cursor.fetchall()
    for i in f:
        print f
    db.close()
test()
'''
125+
126+
import urllib2
127+
import zlib
128+
def getProxy():
    """One-off smoke test: fetch baidu.com through a hard-coded proxy and
    print the (possibly gzip-compressed) response body.

    NOTE(review): ProxyHandler keys are expected to be URL schemes such as
    'http'/'https'; "anonymous" is presumably never matched, so this request
    may not actually be proxied — verify.
    """
    httpproxy_handler = urllib2.ProxyHandler({"anonymous": "61.135.217.7:8080"})
    opener = urllib2.build_opener(httpproxy_handler)
    request = urllib2.Request("http://www.baidu.com/")

    # Only requests sent via opener.open() use the custom proxy; a plain
    # urlopen() call would bypass it.
    response = opener.open(request)
    html = response.read()
    # urllib2.install_opener(opener) would apply the opener globally, so both
    # opener.open() and urlopen() would go through the custom proxy.
    # urllib2.install_opener(opener)
    # response = urlopen(request)
    if response.headers['Content-Encoding'] == 'gzip':
        html = zlib.decompress(html,16+zlib.MAX_WBITS)
    print html
144+

moveinfo.pyc

4.46 KB
Binary file not shown.

movie.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1291000

spider.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
# -*- coding: utf-8 -*-
import urllib2
import zlib
from bs4 import BeautifulSoup
import lxml
import re
import sys
import time
import random
import math
from moveinfo import *
# Python 2 hack: force utf-8 as the default str<->unicode codec so the
# Chinese dict keys used throughout mix safely with byte strings.
# NOTE(review): sys.setdefaultencoding is unsupported and removed in Python 3;
# this module is Python 2 only.
reload(sys)
sys.setdefaultencoding('utf-8')
14+
# Default browser-like HTTP headers for douban requests (Firefox/58 UA,
# gzip accepted, Host pinned to movie.douban.com).
# NOTE(review): this dict is shared module state — getproxy() assigns a
# different 'Host' into it in place, which then leaks into every later
# douban request; callers should copy it before modifying.
req_headers = {
    'Host': 'movie.douban.com',
    'User-Agent': 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:58.0) Gecko/20100101 Firefox/58.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip,deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}
23+
24+
def get(proxy,url):
25+
url = url
26+
httpproxy_handler = urllib2.ProxyHandler(proxy)
27+
opener = urllib2.build_opener(httpproxy_handler)
28+
request = urllib2.Request(url, headers=req_headers)
29+
response = None
30+
try:
31+
response = opener.open(request)
32+
except urllib2.URLError as e:
33+
errorcode = None
34+
reason = None
35+
if hasattr(e, 'code'):
36+
errorcode = e.code
37+
if hasattr(e, 'reason'):
38+
reason = e.reason
39+
return None,errorcode,reason
40+
except Exception as e:
41+
print e
42+
return None, None, None
43+
html = response.read()
44+
if response.headers['Content-Encoding'] == 'gzip':
45+
html = zlib.decompress(html,16+zlib.MAX_WBITS)
46+
return html,None,None
47+
def getMovieInfo(html):
    """Parse a douban movie page into a dict keyed by Chinese field names.

    Extracts title, year, every "key: value" line of the info block, and
    (when the page has votes) the total score, vote count and the 5-star
    rating distribution. Unrated pages get score '0.0' and count '0'.

    Fix over the original: info lines are split on the FIRST colon only
    (str.partition); the old code split on every colon and re-joined the
    pieces without them, corrupting values that contain colons (URLs,
    times). Keys and colon-free values are unchanged.
    """
    movieInfo = {}
    soup = BeautifulSoup(html, 'lxml')
    name = soup.find('span', property="v:itemreviewed").text
    movieInfo['名字'] = str(name)

    # year text is wrapped in parentheses — strip them.
    year = soup.find('span', class_="year").text[1:-1]
    movieInfo['出品时间'] = str(year)
    info = soup.find('div', id='info').get_text()
    infolines = str(info).replace(':\n', ':').lstrip().splitlines()
    for line in infolines:
        key, _, value = line.partition(':')
        movieInfo[key] = value
    if soup.find('span', property="v:votes") != None:
        score = str(soup.find('strong', class_='ll rating_num').text)
        ratingNum = str(soup.find('span', property="v:votes").text)
        # Percentages are listed 5-star first; store them as
        # rating[4] = 5-star ... rating[0] = 1-star.
        rating = [0] * 5
        parent = soup.find('div', class_="ratings-on-weight")
        items = parent.find_all('div', class_='item')
        index = 4
        for item in items:
            span = item.find_all('span')
            rating[index] = str(span[1].text.strip())
            index -= 1
        movieInfo['评分'] = rating
    else:
        score = '0.0'
        ratingNum = '0'
    movieInfo['总评分'] = score
    movieInfo['评分人数'] = ratingNum
    return movieInfo
81+
def runSpider(start, end, size = 10):
    """Crawl douban subject pages for ids in [start, end] and persist them.

    Opens the DB, ensures the table exists, then fetches
    https://movie.douban.com/subject/<id> for each id through a scraped
    proxy, parses it with getMovieInfo() and saves a BasicInfo row.
    Rotates to a fresh proxy every 1000 requests and whenever a request
    fails without an HTTP error code. A 302 response is treated as a
    stop sentinel (presumably douban redirecting to a login/captcha page
    when banned — NOTE(review): confirm).

    NOTE(review): `size`, `items` and `count` are never used. Indentation
    below is reconstructed from a whitespace-mangled paste — verify the
    `continue` / proxy-rotation nesting against the original file.
    """
    db, cursor = init_database()
    create_table(db, cursor)
    url = "https://movie.douban.com/subject/"
    items = 0
    count = start - 1
    counter = 0
    proxy = selectProxy()
    #proxy = {'https':'222.73.68.144:8090'}
    print 'select proxy:', proxy
    old_proxy = proxy
    index = start
    while index <= end:
        tempUrl = url + str(index)
        movieinfo = None
        if counter >= 1000:
            # Rotation point: draw until the new proxy differs from the old.
            counter = 0
            while old_proxy == proxy:
                proxy = selectProxy()
            old_proxy = proxy
            print 'select proxy:', proxy
        html, errorcode, reason = get(proxy, tempUrl)

        counter += 1
        if str(errorcode) == '302':
            db.close()
            print 'get the last html index:%d' % index
            return
        if html == None:
            print 'Get HTML from %s failed! ' % tempUrl, 'Error code:', errorcode, 'reason:', reason
            if errorcode == None:
                # Connection-level failure: assume the proxy died, pick anew.
                proxy = selectProxy()
                print 'select proxy:', proxy
            # index is NOT incremented: the same page is retried.
            continue
        print 'Get HTML from %s success!' % tempUrl
        try:
            movieinfo = getMovieInfo(html)
            movieinfo['index'] = index
            print BasicInfo(movieinfo)
            save(db, cursor, BasicInfo(movieinfo))
            index += 1
        except Exception as e:
            # Parse/save failure: report and retry the same page.
            print e
            continue
    db.close()
126+
127+
def getproxy():
    """Scrape www.xicidaili.com/wn for high-anonymity proxies.

    Returns a list of single-entry dicts ``{scheme_lowercase: 'ip:port'}``
    suitable for urllib2.ProxyHandler.

    Fix over the original: the shared module-level ``req_headers`` dict was
    mutated in place (``header = req_headers; header['Host'] = ...``), so
    every later douban request carried the xicidaili Host header. A copy is
    modified instead. The Content-Encoding check also uses ``.get()`` so a
    response without that header cannot raise KeyError.
    """
    url = 'http://www.xicidaili.com/wn'
    header = dict(req_headers)
    header['Host'] = 'www.xicidaili.com'
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    html = response.read()
    if response.headers.get('Content-Encoding') == 'gzip':
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find('table', id="ip_list").find_all('tr')
    proxy = []
    # Skip the header row; keep only rows marked '高匿' (high anonymity).
    for tr in trs[1:]:
        tds = tr.find_all('td')
        if str(tds[4].get_text()) == '高匿':
            proxy.append({str(tds[5].get_text()).lower(): str(tds[1].get_text() + ':' + tds[2].get_text())})
    return proxy
145+
146+
def selectProxy():
    """Return one proxy dict chosen uniformly at random from getproxy().

    Fix over the original: ``int(random.uniform(0, num))`` can evaluate to
    ``num`` itself (random.uniform may return either endpoint), producing an
    IndexError. ``random.choice`` picks a valid index by construction; like
    the original, it raises IndexError when no proxies were scraped.
    """
    return random.choice(getproxy())
151+
152+
def printInfo(movieinfo):
153+
for key in movieinfo.keys():
154+
print key,movieinfo[key]
155+
# Script entry point: crawl 10,000 subject ids starting at 1295096.
# NOTE(review): runs on import; prefer guarding with
# `if __name__ == "__main__":`.
runSpider(1295096,1295096+10000)
#html ,s,t= get("https://movie.douban.com/subject/1309046")
#movieinfo = getMovieInfo(html)
#getproxy()

0 commit comments

Comments
 (0)