|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +import urllib2 |
| 3 | +import zlib |
| 4 | +from bs4 import BeautifulSoup |
| 5 | +import lxml |
| 6 | +import re |
| 7 | +import sys |
| 8 | +import time |
| 9 | +import random |
| 10 | +import math |
| 11 | +from moveinfo import * |
# Force UTF-8 as the process-wide default codec so implicit str()/unicode
# conversions of the Chinese text scraped below do not raise
# UnicodeEncodeError.  reload() is required because site.py deletes
# sys.setdefaultencoding at startup (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf-8')
# Browser-like request headers sent with every Douban request; getproxy()
# swaps the 'Host' entry for its own target site.
req_headers = {
    'Host': 'movie.douban.com',
    'User-Agent': 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:58.0) Gecko/20100101 Firefox/58.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip,deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
    }
| 23 | + |
| 24 | +def get(proxy,url): |
| 25 | + url = url |
| 26 | + httpproxy_handler = urllib2.ProxyHandler(proxy) |
| 27 | + opener = urllib2.build_opener(httpproxy_handler) |
| 28 | + request = urllib2.Request(url, headers=req_headers) |
| 29 | + response = None |
| 30 | + try: |
| 31 | + response = opener.open(request) |
| 32 | + except urllib2.URLError as e: |
| 33 | + errorcode = None |
| 34 | + reason = None |
| 35 | + if hasattr(e, 'code'): |
| 36 | + errorcode = e.code |
| 37 | + if hasattr(e, 'reason'): |
| 38 | + reason = e.reason |
| 39 | + return None,errorcode,reason |
| 40 | + except Exception as e: |
| 41 | + print e |
| 42 | + return None, None, None |
| 43 | + html = response.read() |
| 44 | + if response.headers['Content-Encoding'] == 'gzip': |
| 45 | + html = zlib.decompress(html,16+zlib.MAX_WBITS) |
| 46 | + return html,None,None |
def getMovieInfo(html):
    """Parse a Douban movie subject page and return a dict of movie fields.

    Keys are the Chinese labels from the page's info panel plus
    '名字' (title), '出品时间' (year), '总评分' (score), '评分人数' (vote
    count) and, when votes exist, '评分' (1..5-star percentage list).
    """
    movieInfo = {}
    soup = BeautifulSoup(html, 'lxml')
    name = soup.find('span', property="v:itemreviewed").text
    movieInfo['名字'] = str(name)

    # The year span renders as "(1994)"; strip the parentheses.
    year = soup.find('span', class_="year").text[1:-1]
    movieInfo['出品时间'] = str(year)
    info = soup.find('div', id='info').get_text()
    infolines = str(info).replace(':\n', ':').lstrip().splitlines()
    for line in infolines:
        # Split on the FIRST colon only: concatenating all split pieces
        # (as before) silently dropped every colon inside the value,
        # mangling URLs such as "http://..." into "http//...".
        parts = line.split(':', 1)
        value = parts[1] if len(parts) == 2 else ''
        movieInfo[parts[0]] = value
    if soup.find('span', property="v:votes") is not None:
        score = str(soup.find('strong', class_='ll rating_num').text)
        ratingNum = str(soup.find('span', property="v:votes").text)
        rating = [0] * 5
        parent = soup.find('div', class_="ratings-on-weight")
        items = parent.find_all('div', class_='item')
        # The page lists the 5-star row first, so fill rating[4]..rating[0].
        index = 4
        for item in items:
            span = item.find_all('span')
            rating[index] = str(span[1].text.strip())
            index -= 1
        movieInfo['评分'] = rating
    else:
        # No votes yet: report zeros instead of failing.
        score = '0.0'
        ratingNum = '0'
    movieInfo['总评分'] = score
    movieInfo['评分人数'] = ratingNum
    return movieInfo
def runSpider(start,end,size = 10):
    """Crawl Douban subject pages with ids start..end and save them to the DB.

    Rotates to a new proxy after every 1000 requests, or immediately when a
    fetch fails without any HTTP status code (dead proxy).  Stops early on a
    302, which is taken to mean the crawl ran past the last existing subject.
    """
    db,cursor = init_database()
    create_table(db,cursor)
    url = "https://movie.douban.com/subject/"
    items = 0
    count = start-1
    counter = 0
    proxy = selectProxy()
    #proxy = {'https':'222.73.68.144:8090'}
    print 'select proxy:', proxy
    old_proxy = proxy
    index = start
    while index <= end:
        tempUrl = url + str(index)
        movieinfo = None
        # After 1000 requests, keep drawing until we get a *different* proxy.
        if counter >= 1000:
            counter = 0
            while old_proxy == proxy:
                proxy = selectProxy()
            old_proxy = proxy
            print 'select proxy:',proxy
        html ,errorcode,reason= get(proxy,tempUrl)

        counter +=1
        if str(errorcode) == '302':
            db.close()
            print 'get the last html index:%d'%index
            return
        if html == None:
            print 'Get HTML from %s failed! ' % tempUrl,'Error code:',errorcode,'reason:',reason
            # No status code at all usually means the proxy itself is dead.
            if errorcode == None:
                proxy = selectProxy()
                print 'select proxy:', proxy
            # Retry the same index with the (possibly new) proxy.
            continue
        print 'Get HTML from %s success!'% tempUrl
        try:
            movieinfo = getMovieInfo(html)
            movieinfo['index'] = index
            print BasicInfo(movieinfo)
            save(db,cursor,BasicInfo(movieinfo))
            index +=1
        except Exception as e:
            # NOTE(review): a page that consistently fails to parse (or a
            # persistent 404) is retried forever since index never advances
            # here — confirm this retry-forever behaviour is intended.
            print e
            continue
    db.close()
| 126 | + |
def getproxy():
    """Scrape www.xicidaili.com/wn and return a list of elite ("高匿") proxies.

    Each entry is a dict like {'https': 'ip:port'} suitable for
    urllib2.ProxyHandler.
    """
    url = 'http://www.xicidaili.com/wn'
    # Copy the headers: the previous `header = req_headers` aliased the
    # shared dict, so setting 'Host' here permanently clobbered the Host
    # header used by every subsequent Douban request in get().
    header = dict(req_headers)
    header['Host'] = 'www.xicidaili.com'
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
        # .get() avoids KeyError when the header is absent.
        if response.headers.get('Content-Encoding') == 'gzip':
            html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    finally:
        response.close()
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find('table', id="ip_list").find_all('tr')
    proxy = []
    for tr in trs[1:]:  # trs[0] is the table header row
        tds = tr.find_all('td')
        # Keep only anonymous ("高匿") proxies; td[5] is the scheme,
        # td[1]/td[2] are ip and port.
        if str(tds[4].get_text()) == '高匿':
            proxy.append({str(tds[5].get_text()).lower(): str(tds[1].get_text() + ':' + tds[2].get_text())})
    return proxy
| 145 | + |
def selectProxy():
    """Return one proxy dict chosen uniformly at random from getproxy().

    random.choice replaces the old int(random.uniform(0, n)) index, which
    could (rarely) yield n itself and raise IndexError.
    """
    return random.choice(getproxy())
| 151 | + |
| 152 | +def printInfo(movieinfo): |
| 153 | + for key in movieinfo.keys(): |
| 154 | + print key,movieinfo[key] |
# Entry point: crawl 10000 subject ids starting at 1295096.  Guarded so that
# importing this module for its helpers does not launch a crawl.
if __name__ == '__main__':
    runSpider(1295096, 1295096 + 10000)