Skip to content

Commit 55b49da

Browse files
committed
cannon: more reasonable ordering for query elements
1 parent 2ca6246 commit 55b49da

File tree

2 files changed

+39
-38
lines changed

2 files changed

+39
-38
lines changed

src/promnesia/cannon.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@
1111
"""
1212
# TODO eh?? they fixed mobile.twitter.com?
1313

14+
from itertools import chain
1415
import re
1516
import typing
16-
from typing import Iterable, NamedTuple, Set, Optional, List, Sequence, Union, Tuple, Dict, Any
17+
from typing import Iterable, NamedTuple, Set, Optional, List, Sequence, Union, Tuple, Dict, Any, Collection
1718

1819
import urllib.parse
1920
from urllib.parse import urlsplit, parse_qsl, urlunsplit, parse_qs, urlencode, SplitResult
@@ -89,44 +90,42 @@ def canonify_domain(dom: str):
8990
'utf8',
9091
}
9192

92-
default_qkeep = {
93+
default_qkeep = [
9394
# ok, various BBS have it, hackernews has it etc?
9495
# hopefully it's a reasonable one to keep..
9596
'id',
9697

97-
# common to some sites..
98+
# common to forums, usually means 'thread'
99+
't',
100+
101+
# common to some sites.., usually 'post'
98102
'p',
99-
}
103+
]
100104

101105
# TODO perhaps, decide if fragment is meaningful (e.g. wiki) or random sequence of letters?
102106
class Spec(NamedTuple):
103-
qkeep : Optional[Set[str]] = None
107+
qkeep : Optional[Collection[str]] = None
104108
qremove: Optional[Set[str]] = None
105109
fkeep : bool = False
106110

107-
def keep_query(self, q: str) -> bool:
108-
qkeep = default_qkeep .union(self.qkeep or {})
111+
def keep_query(self, q: str) -> Optional[int]: # returns order
112+
qkeep = {
113+
q: i for i, q in enumerate(chain(default_qkeep, self.qkeep or []))
114+
}
109115
qremove = default_qremove.union(self.qremove or {})
110116
# I suppose 'remove' is only useful for logging. we remove by default anyway
111117

112118
keep = False
113119
remove = False
114-
if qkeep is not None and q in qkeep:
115-
keep = True
116-
# pylint: disable=unsupported-membership-test
120+
qk = qkeep.get(q)
121+
if qk is not None:
122+
return qk
123+
# todo later, check if spec tells both to keep and remove?
117124
if q in qremove:
118-
remove = True
119-
if keep and remove:
120-
# keep wins?
121-
return True # TODO need a warning
122-
if keep:
123-
return True
124-
if remove:
125-
return False
126-
125+
return None
127126
# by default drop all
128127
# it's a better default, since if it's *too* unified, the user would notice it. but not vice versa!
129-
return False
128+
return None
130129

131130
@classmethod
132131
def make(cls, **kwargs):
@@ -138,14 +137,13 @@ def make(cls, **kwargs):
138137
specs = {
139138
'youtube.com': S(
140139
# TODO search_query?
141-
qkeep={
142-
'list', # TODO hmm. list is kinda important for playlist urls...
140+
qkeep=[ # note: experimental.. order matters here
143141
'v',
144-
}, # TODO frozenset?
142+
't',
143+
'list',
144+
],
145+
# todo frozenset?
145146
qremove={
146-
't', # TODO not so sure about it
147-
148-
149147
'time_continue', 'index', 'feature', 'lc', 'app', 'start_radio', 'pbjreload', 'annotation_id',
150148
'flow', 'sort', 'view',
151149
'enablejsapi', 'wmode', 'html5', 'autoplay', 'ar',
@@ -181,10 +179,9 @@ def make(cls, **kwargs):
181179
'wikipedia.org': S(fkeep=True),
182180
'scottaaronson.com' : S(qkeep={'p'}, fkeep=True),
183181
'urbandictionary.com': S(qkeep={'term'}),
184-
'ycombinator.com' : S(qkeep={'id'}),
182+
'ycombinator.com' : S(qkeep={'id'}), # todo just keep id by default?
185183
'play.google.com' : S(qkeep={'id'}),
186184
'answers.yahoo.com' : S(qkeep={'qid'}),
187-
'ubuntuforums.org' : S(qkeep={'t'}),
188185
}
189186

190187
_def_spec = S()
@@ -410,8 +407,12 @@ def canonify(url: str) -> str:
410407
# frag = parts.fragment if spec.fkeep else ''
411408
frag = ''
412409

413-
qq = [(k, v) for k, v in qq if spec.keep_query(k)]
414-
qq = list(sorted(qq)) # order matters in theory, but definitely not by default
410+
iqq = []
411+
for k, v in qq:
412+
order = spec.keep_query(k)
413+
if order is not None:
414+
iqq.append((order, k, v))
415+
qq = [(k, v) for i, k, v in sorted(iqq)]
415416
# TODO still not sure what we should do..
416417
# quote_plus replaces %20 with +, not sure if we want it...
417418
query = urlencode(qq, quote_via=quote_via) # type: ignore[type-var]

tests/cannon.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@ def check(url, expected):
2828
# then could align URLs etc?
2929

3030
@param('url,expected', [(
31-
'https://www.youtube.com/watch?v=1NHbPN9pNPM&index=63&list=WL&t=491s',
32-
# TODO not so sure about &t, it's sort of useful
33-
'youtube.com/watch?list=WL&v=1NHbPN9pNPM'
31+
'https://www.youtube.com/watch?t=491s&v=1NHbPN9pNPM&index=63&list=WL',
32+
# NOTE: t= reordered, makes it more hierarchical
33+
# list as well, I guess makes the most sense to keep it at the very end.. since lists are more like tags
34+
'youtube.com/watch?v=1NHbPN9pNPM&t=491s&list=WL'
3435
), (
3536
'youtube.com/watch?v=wHrCkyoe72U&feature=share&time_continue=6',
3637
'youtube.com/watch?v=wHrCkyoe72U'
@@ -39,8 +40,7 @@ def check(url, expected):
3940
'youtube.com/watch?v=nyc6RJEEe0U'
4041
), (
4142
'https://youtu.be/iCvmsMzlF7o?list=WL',
42-
# TODO hmm. ordering?
43-
'youtube.com/watch?list=WL&v=iCvmsMzlF7o'
43+
'youtube.com/watch?v=iCvmsMzlF7o&list=WL'
4444
),
4545
# TODO can even be like that or contain timestamp (&t=)
4646
# TODO warn if param already present? shouldn't happen..
@@ -196,10 +196,10 @@ def test_reddit(url, expected):
196196
197197
# should sort query params
198198
( 'https://www.youtube.com/watch?v=hvoQiF0kBI8&list=WL&index=2'
199-
, 'youtube.com/watch?list=WL&v=hvoQiF0kBI8',
199+
, 'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
200200
),
201201
( 'https://www.youtube.com/watch?list=WL&v=hvoQiF0kBI8&index=2'
202-
, 'youtube.com/watch?list=WL&v=hvoQiF0kBI8',
202+
, 'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
203203
),
204204
205205
# TODO def need to allow the _user_ to define the rules.
@@ -211,7 +211,7 @@ def test_reddit(url, expected):
211211
),
212212
213213
( 'https://ubuntuforums.org/showthread.php?t=1403470&s=0dd67bdb12559c22e73a220752db50c7&p=8806195#post8806195'
214-
, 'ubuntuforums.org/showthread.php?p=8806195&t=1403470',
214+
, 'ubuntuforums.org/showthread.php?t=1403470&p=8806195',
215215
),
216216
217217
( 'https://arstechnica.com/?p=1371299',

0 commit comments

Comments
 (0)