Skip to content

Commit ce7294d

Browse files
author
Akash Mahanty
committed
Implemented new feature: known URLs for a domain.
1 parent c9fa114 commit ce7294d

File tree

2 files changed

+126
-17
lines changed

2 files changed

+126
-17
lines changed

waybackpy/cli.py

Lines changed: 77 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import print_function
33
import sys
4+
import os
5+
import re
46
import argparse
57
from waybackpy.wrapper import Url
68
from waybackpy.__version__ import __version__
@@ -31,6 +33,36 @@ def _near(obj, args):
3133
_near_args["minute"] = args.minute
3234
return (obj.near(**_near_args))
3335

36+
def _known_urls(obj, args):
37+
sd = False
38+
al = False
39+
if args.subdomain:
40+
sd = True
41+
if args.alive:
42+
al = True
43+
url_list = obj.known_urls(alive=al, subdomain=sd)
44+
total_urls = len(url_list)
45+
46+
if total_urls > 0:
47+
m = re.search('https?://([A-Za-z_0-9.-]+).*', url_list[0])
48+
if m:
49+
domain = m.group(1)
50+
else:
51+
domain = "waybackpy-known"
52+
dir_path = os.path.abspath(os.getcwd())
53+
file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls)
54+
text = "\n".join(url_list) + "\n"
55+
with open(file_name, "a+") as f:
56+
f.write(text)
57+
text = text + "%d URLs found and saved in ./%s-%d-urls.txt" % (
58+
total_urls, domain, total_urls
59+
)
60+
61+
else:
62+
text = "No known URLs found. Please try a diffrent domain!"
63+
64+
return text
65+
3466
def _get(obj, args):
3567
if args.get.lower() == "url":
3668
return (obj.get())
@@ -52,10 +84,10 @@ def _get(obj, args):
5284

5385
def args_handler(args):
5486
if args.version:
55-
return (__version__)
87+
return ("waybackpy version %s" % __version__)
5688

5789
if not args.url:
58-
return ("Specify an URL. See --help for help using waybackpy.")
90+
return ("waybackpy %s \nSee 'waybackpy --help' for help using this tool." % __version__)
5991

6092
if args.user_agent:
6193
obj = Url(args.url, args.user_agent)
@@ -72,26 +104,54 @@ def args_handler(args):
72104
return _total_archives(obj)
73105
if args.near:
74106
return _near(obj, args)
107+
if args.known_urls:
108+
return _known_urls(obj, args)
75109
if args.get:
76110
return _get(obj, args)
77-
return ("Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help for help using waybackpy.")
111+
return ("You only specified the URL. But you also need to specify the operation.\nSee 'waybackpy --help' for help using this tool.")
78112

79113
def parse_args(argv):
    """Build the CLI argument parser and parse argv[1:].

    argv -- full argument vector including the program name at index 0.
    Returns an argparse.Namespace with all flags/options defined below.
    """
    parser = argparse.ArgumentParser()

    required_args = parser.add_argument_group("URL argument (required)")
    required_args.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur")

    user_agent_arg = parser.add_argument_group("User Agent")
    user_agent_arg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"")

    save_arg = parser.add_argument_group("Create new archive/save URL")
    save_arg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine")

    oldest_arg = parser.add_argument_group("Oldest archive")
    oldest_arg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL")

    newest_arg = parser.add_argument_group("Newest archive")
    newest_arg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL")

    total_arg = parser.add_argument_group("Total number of archives")
    total_arg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL")

    get_arg = parser.add_argument_group("Get source code")
    get_arg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage")

    # Typos fixed in the user-facing strings below: "Waybcak" -> "Wayback",
    # "inlclude" -> "include".
    known_url_arg = parser.add_argument_group("URLs known and archived to Wayback Machine for the site.")
    known_url_arg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.")
    known_url_arg.add_argument("--subdomain", "-sub", action='store_true', help="Use with '--known_urls' to include known URLs for subdomains.")
    known_url_arg.add_argument("--alive", "-a", action='store_true', help="Only include live URLs. Will not include dead links.")

    near_arg = parser.add_argument_group("Archive close to time specified")
    near_arg.add_argument("--near", "-N", action='store_true', help="Archive near specified time")

    # Typo fixed: "intege" -> "integer"; trailing period removed from the
    # --day help text for consistency with its siblings.
    near_args = parser.add_argument_group("Arguments that are used only with --near")
    near_args.add_argument("--year", "-Y", type=int, help="Year in integer")
    near_args.add_argument("--month", "-M", type=int, help="Month in integer")
    near_args.add_argument("--day", "-D", type=int, help="Day in integer")
    near_args.add_argument("--hour", "-H", type=int, help="Hour in integer")
    near_args.add_argument("--minute", "-MIN", type=int, help="Minute in integer")

    parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version")

    return parser.parse_args(argv[1:])
96156

97157
def main(argv=None):

waybackpy/wrapper.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,10 @@ def get(self, url="", user_agent="", encoding=""):
100100
"""Return the source code of the supplied URL.
101101
If encoding is not supplied, it is auto-detected from the response.
102102
"""
103+
103104
if not url:
104105
url = self._clean_url()
106+
105107
if not user_agent:
106108
user_agent = self.user_agent
107109

@@ -173,3 +175,50 @@ def total_archives(self):
173175
response = _get_response(req)
174176
# Most efficient method to count number of archives (yet)
175177
return str(response.read()).count(",")
178+
179+
def known_urls(self, alive=False, subdomain=False):
    """Return a list of URLs known to exist for the given domain name,
    because these URLs were crawled by Wayback Machine bots.

    alive     -- if True, probe each URL and drop unreachable (dead) ones.
    subdomain -- if True, also include URLs from subdomains (CDX '*.domain' query).

    Useful for pen-testers and others.

    Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
    https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
    """
    # Both branches now use https consistently (the non-subdomain query
    # previously used plain http).
    if subdomain:
        request_url = (
            "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
            % self._clean_url()
        )
    else:
        request_url = (
            "https://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
            % self._clean_url()
        )

    hdr = {"User-Agent": "%s" % self.user_agent}
    req = Request(request_url, headers=hdr)  # nosec
    response = _get_response(req)

    # CDX JSON output is a list of rows; the first row is the header
    # ["original"], which we filter out.
    data = json.loads(response.read().decode("UTF-8"))
    url_list = [row[0] for row in data if row[0] != "original"]

    # Remove all dead URLs from url_list if alive=True.
    if alive:
        alive_urls = []
        for url in url_list:
            try:
                urlopen(url)  # nosec
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit still propagate; any fetch failure means
                # the URL is treated as dead and skipped.
                continue
            alive_urls.append(url)
        url_list = alive_urls

    return url_list

0 commit comments

Comments
 (0)