|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +import urllib2 |
| 3 | +import zlib |
| 4 | +from bs4 import BeautifulSoup |
| 5 | +import lxml |
| 6 | +import re |
| 7 | +import sys |
| 8 | +import time |
| 9 | +import random |
| 10 | +import math |
| 11 | +from moveinfo import * |
# Force UTF-8 as the process-wide default codec so implicit str()/unicode
# conversions of the Chinese text scraped below do not raise
# UnicodeEncodeError.  reload() is required because site.py deletes
# sys.setdefaultencoding at startup (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf-8')
# Browser-like request headers sent with every Douban request; getproxy()
# swaps the 'Host' entry for its own target site.
req_headers = {
    'Host': 'movie.douban.com',
    'User-Agent': 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:58.0) Gecko/20100101 Firefox/58.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip,deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
    }
| 23 | + |
| 24 | +def get(proxy,url): |
| 25 | + url = url |
| 26 | + httpproxy_handler = urllib2.ProxyHandler(proxy) |
| 27 | + opener = urllib2.build_opener(httpproxy_handler) |
| 28 | + request = urllib2.Request(url, headers=req_headers) |
| 29 | + response = None |
| 30 | + try: |
| 31 | + response = opener.open(request) |
| 32 | + except urllib2.URLError as e: |
| 33 | + errorcode = None |
| 34 | + reason = None |
| 35 | + if hasattr(e, 'code'): |
| 36 | + errorcode = e.code |
| 37 | + if hasattr(e, 'reason'): |
| 38 | + reason = e.reason |
| 39 | + return None,errorcode,reason |
| 40 | + except Exception as e: |
| 41 | + print e |
| 42 | + return None, None, None |
| 43 | + html = response.read() |
| 44 | + if response.headers['Content-Encoding'] == 'gzip': |
| 45 | + html = zlib.decompress(html,16+zlib.MAX_WBITS) |
| 46 | + return html,None,None |
def getMovieInfo(html):
    """Parse a Douban movie subject page and return a dict of movie fields.

    Keys are the Chinese labels from the page's info panel plus
    '名字' (title), '出品时间' (year), '总评分' (score), '评分人数' (vote
    count) and, when votes exist, '评分' (1..5-star percentage list).
    """
    movieInfo = {}
    soup = BeautifulSoup(html, 'lxml')
    name = soup.find('span', property="v:itemreviewed").text
    movieInfo['名字'] = str(name)

    # The year span renders as "(1994)"; strip the parentheses.
    year = soup.find('span', class_="year").text[1:-1]
    movieInfo['出品时间'] = str(year)
    info = soup.find('div', id='info').get_text()
    infolines = str(info).replace(':\n', ':').lstrip().splitlines()
    for line in infolines:
        # Split on the FIRST colon only: concatenating all split pieces
        # (as before) silently dropped every colon inside the value,
        # mangling URLs such as "http://..." into "http//...".
        parts = line.split(':', 1)
        value = parts[1] if len(parts) == 2 else ''
        movieInfo[parts[0]] = value
    if soup.find('span', property="v:votes") is not None:
        score = str(soup.find('strong', class_='ll rating_num').text)
        ratingNum = str(soup.find('span', property="v:votes").text)
        rating = [0] * 5
        parent = soup.find('div', class_="ratings-on-weight")
        items = parent.find_all('div', class_='item')
        # The page lists the 5-star row first, so fill rating[4]..rating[0].
        index = 4
        for item in items:
            span = item.find_all('span')
            rating[index] = str(span[1].text.strip())
            index -= 1
        movieInfo['评分'] = rating
    else:
        # No votes yet: report zeros instead of failing.
        score = '0.0'
        ratingNum = '0'
    movieInfo['总评分'] = score
    movieInfo['评分人数'] = ratingNum
    return movieInfo
def runSpider(start,end,size = 10):
    """Crawl Douban subject pages with ids start..end and save them to the DB.

    Rotates to a new proxy after every 1000 requests, or immediately when a
    fetch fails without any HTTP status code (dead proxy).  Stops early on a
    302, which is taken to mean the crawl ran past the last existing subject.
    """
    db,cursor = init_database()
    create_table(db,cursor)
    url = "https://movie.douban.com/subject/"
    items = 0
    count = start-1
    counter = 0
    proxy = selectProxy()
    #proxy = {'https':'222.73.68.144:8090'}
    print 'select proxy:', proxy
    old_proxy = proxy
    index = start
    while index <= end:
        tempUrl = url + str(index)
        movieinfo = None
        # After 1000 requests, keep drawing until we get a *different* proxy.
        if counter >= 1000:
            counter = 0
            while old_proxy == proxy:
                proxy = selectProxy()
            old_proxy = proxy
            print 'select proxy:',proxy
        html ,errorcode,reason= get(proxy,tempUrl)

        counter +=1
        if str(errorcode) == '302':
            db.close()
            print 'get the last html index:%d'%index
            return
        if html == None:
            print 'Get HTML from %s failed! ' % tempUrl,'Error code:',errorcode,'reason:',reason
            # No status code at all usually means the proxy itself is dead.
            if errorcode == None:
                proxy = selectProxy()
                print 'select proxy:', proxy
            # Retry the same index with the (possibly new) proxy.
            continue
        print 'Get HTML from %s success!'% tempUrl
        try:
            movieinfo = getMovieInfo(html)
            movieinfo['index'] = index
            print BasicInfo(movieinfo)
            save(db,cursor,BasicInfo(movieinfo))
            index +=1
        except Exception as e:
            # NOTE(review): a page that consistently fails to parse (or a
            # persistent 404) is retried forever since index never advances
            # here — confirm this retry-forever behaviour is intended.
            print e
            continue
    db.close()
| 126 | + |
def getproxy():
    """Scrape www.xicidaili.com/wn and return a list of elite ("高匿") proxies.

    Each entry is a dict like {'https': 'ip:port'} suitable for
    urllib2.ProxyHandler.
    """
    url = 'http://www.xicidaili.com/wn'
    # Copy the headers: the previous `header = req_headers` aliased the
    # shared dict, so setting 'Host' here permanently clobbered the Host
    # header used by every subsequent Douban request in get().
    header = dict(req_headers)
    header['Host'] = 'www.xicidaili.com'
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
        # .get() avoids KeyError when the header is absent.
        if response.headers.get('Content-Encoding') == 'gzip':
            html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    finally:
        response.close()
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find('table', id="ip_list").find_all('tr')
    proxy = []
    for tr in trs[1:]:  # trs[0] is the table header row
        tds = tr.find_all('td')
        # Keep only anonymous ("高匿") proxies; td[5] is the scheme,
        # td[1]/td[2] are ip and port.
        if str(tds[4].get_text()) == '高匿':
            proxy.append({str(tds[5].get_text()).lower(): str(tds[1].get_text() + ':' + tds[2].get_text())})
    return proxy
| 145 | + |
def selectProxy():
    """Return one proxy dict chosen uniformly at random from getproxy().

    random.choice replaces the old int(random.uniform(0, n)) index, which
    could (rarely) yield n itself and raise IndexError.
    """
    return random.choice(getproxy())
| 151 | + |
| 152 | +def printInfo(movieinfo): |
| 153 | + for key in movieinfo.keys(): |
| 154 | + print key,movieinfo[key] |
# Entry point: crawl 10000 subject ids starting at 1295096.  Guarded so that
# importing this module for its helpers does not launch a crawl.
if __name__ == '__main__':
    runSpider(1295096, 1295096 + 10000)