Skip to content

Commit a6c206d

Browse files
committed
merge
2 parents 29e890a + 1446bf4 commit a6c206d

File tree

14 files changed

+312
-299
lines changed

14 files changed

+312
-299
lines changed

图片抓取/游侠客.xml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
<root>
2+
<Doc Name="" Description="" Version="0" SavePath="\\VBOXSVR\dev\Hawk-Projects\图片抓取\游侠客.xml">
3+
<DBConnections />
4+
<Children Name="相册列表" Type="SmartCrawler" URL="http://www.youxiake.net/gallery?f=latest" RootXPath="" IsMultiData="List" URLFilter="" ContentFilter="" Crawler="" CreateTime="2016/6/29 20:08:05" Description="任务描述" ScriptPath="" Children="System.Collections.Generic.List`1[Hawk.Core.Utils.Plugins.FreeDocument]">
5+
<HttpSet URL="http://www.cnblogs.com/" Allowautoredirect="True" Postdata="" Encoding="Unknown" Method="GET" Parameters="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36&#xA;Cookie:yxk_saltkey=cTifzflU;expires=Fri, 29-Jul-2016 11:38:04 GMT;path=/;domain=.youxiake.net;httponly,yxk_last_visit=1467196684;PHPSESSID=mc03irmht956tpvvu8afo6d3l3;=&#xA;" />
6+
<Children Name="相册" XPath="/html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[1]/a[1]/@href[1]" IsHtml="False" />
7+
<Children Name="标题" XPath="/html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/a[1]" IsHtml="False" />
8+
<Children Name="名称" XPath="/html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[2]/div[2]/a[1]" IsHtml="False" />
9+
<Children Name="查看" XPath="/html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/b[1]/a[1]" IsHtml="False" />
10+
<Children Name="喜爱" XPath="/html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[2]/div[2]/b[2]/a[1]" IsHtml="False" />
11+
<Children Name="推荐" XPath="/html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/b[3]/a[1]" IsHtml="False" />
12+
</Children>
13+
<Children Name="主流程" Type="SmartETLTool" MaxThreadCount="20" GenerateMode="串行模式" SampleMount="20" CreateTime="2016/6/29 20:08:05" Description="任务描述" ScriptPath="" Children="System.Collections.Generic.List`1[Hawk.Core.Utils.Plugins.FreeDocument]">
14+
<Children Enabled="True" MinValue="1" MaxValue="1503" Interval="1" Column="id" Position="0" MergeType="Append" Type="RangeGE" Group="Generator" />
15+
<Children Enabled="True" MergeWith="" Format="http://www.youxiake.net/gallery?f=latest&amp;p={0}" Column="id" NewColumn="url" OneOutput="True" IsMultiYield="False" Type="MergeTF" Group="Transformer" />
16+
<Children Enabled="True" MaxTryCount="1" ErrorDelay="3000" IsRegex="False" CrawlerSelector="相册列表" Column="url" NewColumn="" OneOutput="False" IsMultiYield="True" Type="CrawlerTF" Group="Transformer" />
17+
<Children Enabled="True" MergeWith="" Format="http://www.youxiake.net/{0}" Column="相册" NewColumn="url" OneOutput="True" IsMultiYield="False" Type="MergeTF" Group="Transformer" />
18+
<Children Enabled="True" IsMultiYield="False" Index="0" Script="(-?\d+)(\.\d+)?" NewColumn="" Column="相册" OneOutput="True" Type="NumberTF" Group="Transformer" />
19+
<Children AddTask="True" NewColumn="相册" ETLSelector="相册抓取" Column="相册" Enabled="True" Type="EtlEX" Group="Executor" />
20+
</Children>
21+
<Children Name="图片列表" Type="SmartCrawler" URL="http://www.youxiake.net/album/518572" RootXPath="/html[1]/body[1]/div[3]/div[3]/div" IsMultiData="List" URLFilter="" ContentFilter="" Crawler="" CreateTime="2016/6/29 20:08:05" Description="任务描述" ScriptPath="" Children="System.Collections.Generic.List`1[Hawk.Core.Utils.Plugins.FreeDocument]">
22+
<HttpSet URL="http://www.cnblogs.com/" Allowautoredirect="True" Postdata="" Encoding="Unknown" Method="GET" Parameters="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36&#xA;Cookie:yxk_last_visit=1467196961;expires=Fri, 29-Jul-2016 11:42:41 GMT;path=/;domain=.youxiake.net;yxk_saltkey=pf7S9s58;httponly,yxk_last_visit=1467196961;PHPSESSID=6c49h205tslkvs608gn26gj774;=&#xA;" />
23+
<Children Name="大图" XPath="/div[2]/div[1]/img[1]/@data-src[1]" IsHtml="False" />
24+
<Children Name="Exif" XPath="/div[2]/div[2]/div[1]" IsHtml="False" />
25+
<Children Name="rid" XPath="/div[3]/div[1]/a[1]/@id[1]" IsHtml="False" />
26+
</Children>
27+
<Children Name="相册抓取" Type="SmartETLTool" MaxThreadCount="20" GenerateMode="串行模式" SampleMount="20" CreateTime="2016/6/29 20:08:05" Description="任务描述" ScriptPath="" Children="System.Collections.Generic.List`1[Hawk.Core.Utils.Plugins.FreeDocument]">
28+
<Children Enabled="False" Content="518572" Column="相册" Position="0" MergeType="Append" Type="TextGE" Group="Generator" />
29+
<Children Enabled="True" MergeWith="" Format="http://www.youxiake.net/album/{0}" Column="相册" NewColumn="url" OneOutput="True" IsMultiYield="False" Type="MergeTF" Group="Transformer" />
30+
<Children Enabled="True" MaxTryCount="1" ErrorDelay="3000" IsRegex="False" CrawlerSelector="图片列表" Column="url" NewColumn="相册" OneOutput="False" IsMultiYield="True" Type="CrawlerTF" Group="Transformer" />
31+
<Children Enabled="True" IsMultiYield="False" Index="0" Script="\d+" NewColumn="" Column="rid" OneOutput="True" Type="NumberTF" Group="Transformer" />
32+
<Children Enabled="True" MergeWith="rid" Format="D:\游侠客\{0}\{1}.jpg" Column="相册" NewColumn="save" OneOutput="True" IsMultiYield="False" Type="MergeTF" Group="Transformer" />
33+
<Children Enabled="True" Column="id" NewColumn="" OneOutput="False" IsMultiYield="False" Type="DeleteTF" Group="Transformer" />
34+
<Children Enabled="True" Column="save" NewColumn="跳过" OneOutput="True" IsMultiYield="False" Type="FileExistFT" Group="Transformer" />
35+
<Children Enabled="True" Script="False" Count="1" Revert="False" Column="跳过" IsDebugFilter="True" Type="RegexFT" Group="Filter" />
36+
<Children Enabled="True" SavePath="[save]" IsAsync="False" Column="大图" Type="SaveFileEX" Group="Executor" />
37+
<Children Enabled="True" Column="跳过" NewColumn="" OneOutput="False" IsMultiYield="False" Type="DeleteTF" Group="Transformer" />
38+
<Children Enabled="True" Column="save" NewColumn="" OneOutput="False" IsMultiYield="False" Type="DeleteTF" Group="Transformer" />
39+
<Children Enabled="False" ExecuteType="OnlyInsert" Column="大图" Type="DbEX" Group="Executor" />
40+
</Children>
41+
</Doc>
42+
</root>

图片抓取/蚂蜂窝.xml

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
<root>
2+
<Doc Name="" Description="" Version="0" SavePath="\\VBOXSVR\dev\Hawk-Projects\图片抓取\蚂蜂窝.xml">
3+
<DBConnections>
4+
<Children DBName="ant" Name="MongoDB" TypeName="MongoDBConnector" ConnectString="mongodb://10.101.167.107" AutoConnect="True" />
5+
</DBConnections>
6+
<Children Name="瀑布流列表" Type="SmartCrawler" URL="http://www.mafengwo.cn/mdd/ajax_photolist.php?act=getMddPhotoList&amp;mddid=10130&amp;page=4" RootXPath="" IsMultiData="List" URLFilter="http://www.mafengwo.cn/photo/mdd/10130.html" ContentFilter="七天七夜的旅程" Crawler="" CreateTime="2016/6/29 18:30:39" Description="任务描述" ScriptPath="" Children="System.Collections.Generic.List`1[Hawk.Core.Utils.Plugins.FreeDocument]">
7+
<HttpSet URL="http://www.cnblogs.com/" Allowautoredirect="True" Postdata="" Encoding="UTF8" Method="GET" Parameters="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36&#xA;Cookie:oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222016-06-29+17%3A03%3A00%22%3B%7D;expires=Wed, 06-Jul-2016 09:02:09 GMT;path=/;domain=.mafengwo.cn;mfw_uuid=57738ec3-5289-7498-78a2-2a709b667b82;PHPSESSID=2kgmpmco0l8pma69k042cdqea3;HttpOnly,mfw_uuid=57738e91-47a4-d4d8-0012-93e6801288ef;=&#xA;" />
8+
<Children Name="url" XPath="/ul[1]/li[1]/a[1]/@href[1]" IsHtml="False" />
9+
<Children Name="喜欢" XPath="/ul[1]/li[1]/span[1]" IsHtml="False" />
10+
<Children Name="作者" XPath="/ul[2]/li[1]/div[1]/p[1]/a[1]" IsHtml="False" />
11+
<Children Name="标题" XPath="/ul[1]/li[1]/div[1]/p[2]/a[1]" IsHtml="False" />
12+
</Children>
13+
<Children Name="马蜂窝相册" Type="SmartETLTool" MaxThreadCount="20" GenerateMode="串行模式" SampleMount="20" CreateTime="2016/6/29 18:30:39" Description="任务描述" ScriptPath="" Children="System.Collections.Generic.List`1[Hawk.Core.Utils.Plugins.FreeDocument]">
14+
<Children Enabled="True" Content="http://www.mafengwo.cn/localdeals/" Column="url" Position="0" MergeType="Append" Type="TextGE" Group="Generator" />
15+
<Children Enabled="True" MaxTryCount="1" ErrorDelay="3000" IsRegex="False" SetPrefex="" CrawlerSelector="网页采集器" Column="url" NewColumn="" OneOutput="False" IsMultiYield="False" Type="CrawlerTF" Group="Transformer" />
16+
<Children XPath="//dd//a" IsManyData="True" GetText="False" GetCount="False" IsInsertNull="False" Column="Content" NewColumn="" Enabled="True" OneOutput="True" IsMultiYield="True" Type="XPathTF" Group="Transformer" />
17+
<Children Column="HTML" NewColumn="" Enabled="True" OneOutput="False" IsMultiYield="False" Type="DeleteTF" Group="Transformer" />
18+
<Children FromBack="False" ShouldSplitChars="False" SplitPause="True" SplitNull="True" Index="0" SplitChar="" Column="Text" NewColumn="" Enabled="True" OneOutput="True" IsMultiYield="False" Type="SplitTF" Group="Transformer" />
19+
<Children FromBack="False" ShouldSplitChars="False" SplitPause="False" SplitNull="True" Index="1" SplitChar="&quot;" Column="OHTML" NewColumn="" Enabled="True" OneOutput="True" IsMultiYield="False" Type="SplitTF" Group="Transformer" />
20+
<Children Enabled="False" Script="三亚" Count="1" Revert="False" Column="Text" IsDebugFilter="True" Type="RegexFT" Group="Filter" />
21+
<Children Enabled="True" IsMultiYield="False" Index="2" Script="(-?\d+)(\.\d+)?" NewColumn="" Column="OHTML" OneOutput="True" Type="NumberTF" Group="Transformer" />
22+
<Children Enabled="True" Revert="False" Column="Text" IsDebugFilter="True" Type="NullFT" Group="Filter" />
23+
<Children Revert="False" Column="Text" Enabled="True" IsDebugFilter="True" Type="RepeatFT" Group="Filter" />
24+
<Children Enabled="True" IDColumn="[Text]" MountColumn="1000" DisplayProgress="True" Column="Text" NewColumn="" OneOutput="True" IsMultiYield="True" Type="ToListTF" Group="Transformer" />
25+
<Children Enabled="True" MergeWith="" Format="http://www.mafengwo.cn/photo/mdd/{0}.html" Column="OHTML" NewColumn="url" OneOutput="True" IsMultiYield="False" Type="MergeTF" Group="Transformer" />
26+
<Children Enabled="True" Revert="False" Column="Text" IsDebugFilter="True" Type="NullFT" Group="Filter" />
27+
<Children Enabled="True" Column="Text" NewColumn="城市" OneOutput="False" IsMultiYield="False" Type="RenameTF" Group="Transformer" />
28+
<Children Enabled="True" Column="OHTML" NewColumn="id" OneOutput="False" IsMultiYield="False" Type="RenameTF" Group="Transformer" />
29+
<Children Enabled="True" MinValue="1" MaxValue="80" Interval="1" Column="page" Position="0" MergeType="Cross" Type="RangeGE" Group="Generator" />
30+
<Children Enabled="True" MergeWith="page" Format="http://www.mafengwo.cn/mdd/ajax_photolist.php?act=getMddPhotoList&amp;mddid={0}&amp;page={1}" Column="id" NewColumn="purl" OneOutput="True" IsMultiYield="False" Type="MergeTF" Group="Transformer" />
31+
<Children Enabled="True" MaxTryCount="1" ErrorDelay="3000" IsRegex="False" SetPrefex="" CrawlerSelector="瀑布流列表" Column="purl" NewColumn="id 城市" OneOutput="False" IsMultiYield="True" Type="CrawlerTF" Group="Transformer" />
32+
<Children Enabled="True" MergeWith="" Format="http://www.mafengwo.cn/{0}" Column="url" NewColumn="" OneOutput="True" IsMultiYield="False" Type="MergeTF" Group="Transformer" />
33+
<Children Enabled="True" MaxTryCount="1" ErrorDelay="3000" IsRegex="False" SetPrefex="" CrawlerSelector="网页采集器" Column="url" NewColumn="" OneOutput="False" IsMultiYield="False" Type="CrawlerTF" Group="Transformer" />
34+
<Children Enabled="True" Former="_j_stageimg&quot; src=&quot;" End="data-imgi" HaveStartEnd="False" Column="Content" NewColumn="" OneOutput="True" IsMultiYield="False" Type="StrExtractTF" Group="Transformer" />
35+
<Children Enabled="True" Column="Content" NewColumn="大图" OneOutput="False" IsMultiYield="False" Type="RenameTF" Group="Transformer" />
36+
<Children Enabled="True" IsMultiYield="False" Index="1" Script="(-?\d+)(\.\d+)?" NewColumn="uid" Column="url" OneOutput="True" Type="NumberTF" Group="Transformer" />
37+
<Children Enabled="True" MergeWith="城市" Format="Z:\picture\蚂蜂窝\{1}\{0}.jpg" Column="uid" NewColumn="保存" OneOutput="True" IsMultiYield="False" Type="MergeTF" Group="Transformer" />
38+
<Children Enabled="True" Column="保存" NewColumn="是否跳过" OneOutput="True" IsMultiYield="False" Type="FileExistFT" Group="Transformer" />
39+
<Children Enabled="True" Script="False" Count="1" Revert="False" Column="是否跳过" IsDebugFilter="False" Type="RegexFT" Group="Filter" />
40+
<Children Enabled="True" SavePath="[保存]" IsAsync="False" Column="大图" Type="SaveFileEX" Group="Executor" />
41+
<Children Enabled="True" Column="保存" NewColumn="" OneOutput="False" IsMultiYield="False" Type="DeleteTF" Group="Transformer" />
42+
<Children Enabled="True" Column="是否跳过" NewColumn="" OneOutput="False" IsMultiYield="False" Type="DeleteTF" Group="Transformer" />
43+
<Children Enabled="True" Column="url" NewColumn="" OneOutput="False" IsMultiYield="False" Type="DeleteTF" Group="Transformer" />
44+
<Children Enabled="True" ExecuteType="OnlyInsert" TableName="mafengwo" Column="uid" Type="DbEX" Group="Executor" Connector="MongoDB" />
45+
</Children>
46+
<Children Name="网页采集器" Type="SmartCrawler" URL="" RootXPath="" IsMultiData="One" URLFilter="" ContentFilter="" Crawler="" CreateTime="2016/6/29 18:30:39" Description="任务描述" ScriptPath="">
47+
<HttpSet URL="http://www.cnblogs.com/" Allowautoredirect="True" Postdata="" Encoding="Unknown" Method="GET" Parameters="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36&#xA;Cookie:oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222016-06-29+19%3A03%3A42%22%3B%7D;expires=Wed, 06-Jul-2016 09:55:09 GMT;path=/;domain=.mafengwo.cn;mfw_uuid=5773a3d3-41d6-2243-aedc-d0b9557532d8;PHPSESSID=36p6td97hrb0emhlcaqm3lpak0;HttpOnly,mfw_uuid=57739afd-3c9c-d6ec-555d-9e7dc0d8942a;=&#xA;" />
48+
</Children>
49+
<Children Name="照片详情" Type="SmartCrawler" URL="http://www.mafengwo.cn/photo/mdd/10065_25459190.html" RootXPath="" IsMultiData="One" URLFilter="" ContentFilter="" Crawler="" CreateTime="2016/6/29 19:12:08" Description="任务描述" ScriptPath="" Children="System.Collections.Generic.List`1[Hawk.Core.Utils.Plugins.FreeDocument]">
50+
<HttpSet URL="http://www.cnblogs.com/" Allowautoredirect="True" Postdata="" Encoding="Unknown" Method="GET" Parameters="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36&#xA;Cookie:oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222016-06-29+19%3A08%3A32%22%3B%7D;expires=Wed, 06-Jul-2016 11:05:06 GMT;path=/;domain=.mafengwo.cn;mfw_uuid=5773ac2f-e646-1173-2b62-e7d5059ae5bb;PHPSESSID=g6k0e8gllr58su5tjbpha509o1;HttpOnly,mfw_uuid=5773ab62-d523-4d22-ce17-6c3524c9af54;=&#xA;" />
51+
<Children Name="大图" XPath="/html[1]/body[1]/div[2]/div[3]/div[1]/div[1]/img[1]/@src[1]" IsHtml="False" />
52+
<Children Name="位置" XPath="/html[1]/body[1]/div[2]/div[3]/div[2]/ul[1]/li[2]/p[1]/a[1]" IsHtml="False" />
53+
</Children>
54+
<Children Name="瀑布流列表2" Type="SmartCrawler" URL="http://www.mafengwo.cn/mdd/ajax_photolist.php?act=getMddPhotoList&amp;mddid=10130&amp;page=4" RootXPath="/html/body/ul" IsMultiData="List" URLFilter="http://www.mafengwo.cn/photo/mdd/10130.html" ContentFilter="七天七夜的旅程" Crawler="" CreateTime="2016/7/4 11:39:49" Description="任务描述" ScriptPath="">
55+
<HttpSet URL="http://www.cnblogs.com/" Allowautoredirect="True" Postdata="" Encoding="UTF8" Method="GET" Parameters="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36&#xA;Cookie:oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222016-06-29+17%3A03%3A00%22%3B%7D;expires=Wed, 06-Jul-2016 09:02:09 GMT;path=/;domain=.mafengwo.cn;mfw_uuid=57738ec3-5289-7498-78a2-2a709b667b82;PHPSESSID=2kgmpmco0l8pma69k042cdqea3;HttpOnly,mfw_uuid=57738e91-47a4-d4d8-0012-93e6801288ef;=&#xA;" />
56+
<Children Name="url" XPath="/li[1]/a[1]/@href[1]" IsHtml="False" />
57+
<Children Name="喜欢" XPath="/li[1]/span[1]" IsHtml="False" />
58+
<Children Name="作者" XPath="/li[1]/div[1]/p[1]/a[1]" IsHtml="False" />
59+
<Children Name="标题" XPath="/li[1]/div[1]/p[2]/a[1]" IsHtml="False" />
60+
</Children>
61+
</Doc>
62+
</root>

0 commit comments

Comments
 (0)