-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtaonvlang.py
More file actions
68 lines (63 loc) · 2.03 KB
/
taonvlang.py
File metadata and controls
68 lines (63 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import os
import urllib
#获取整个页面的html(使用PhantomJS+webdriver解析)
PDriver0 = webdriver.PhantomJS(executable_path = 'C:/Users/lyzdsb/Desktop/phantomjs-2.1.1-windows/bin/phantomjs')
PDriver0.get('https://mm.taobao.com/search_tstar_model.htm?')
soup = BeautifulSoup(PDriver0.page_source , 'lxml')
#print(soup.prettify()) 格式化输出html,check源html
#获取html后,用bs4获取相对应的url
#获取封面照片
cover_imgs = soup.find_all("img", { 'data-ks-lazyload' : True })
#for img in cover_imgs:
#print(img.get())
'''
#坑1:得到img标签后,无法得到attribute:data-ks-lazyload
'''
#获取个人链接
def perUrl_List():
Urls = soup.find_all("a",href = re.compile('\/\/.*userId=(\d*)'))
PDriver1 = webdriver.PhantomJS(executable_path = 'C:/Users/lyzdsb/Desktop/phantomjs-2.1.1-windows/bin/phantomjs')
perUrls = []
for per in Urls:
perUrl = PDriver1.get(per.get('href'))
perUrls.append(perUrl)
#print(perUrls)
return perUrls
#获取个人页面的html源码
Urls = soup.find_all("a", href=re.compile('\/\/.*userId=(\d*)'))
url = Urls[""]
#获取个人文本信息
perInfos = PDriver0.find_element_by_id('J_GirlsList').text
print(perInfos)
#创建文件夹
def mkdir(path):
isExists = os.path.exists(path)
if not isExists:
#创建
print("创建目录为"+path+"的文件夹")
os.mkdir(path)
return True
if isExists:
print(path+"已存在")
return False
#保存封面图片到本地
url = "https:"+str(cover_imgs)
#open,read(data),write,close,flush
html = urlopen(url)
data = html.read()
Filename = "C:/Users/lyzdsb/Desktop/img"
File =Filename+str(number)+".jpg"
f = open(File,"wb") #write in binary
try:
f.write(data)
finally:
f.close()
f.flush()
#保存个人主页的艺术照
perUrrl = "https:"+str(perUrl)
html_per = urlopen(perUrrl)
data = html.read()
Filename =