1111"""
1212# TODO eh?? they fixed mobile.twitter.com?
1313
14+ from itertools import chain
1415import re
1516import typing
16- from typing import Iterable , NamedTuple , Set , Optional , List , Sequence , Union , Tuple , Dict , Any
17+ from typing import Iterable , NamedTuple , Set , Optional , List , Sequence , Union , Tuple , Dict , Any , Collection
1718
1819import urllib .parse
1920from urllib .parse import urlsplit , parse_qsl , urlunsplit , parse_qs , urlencode , SplitResult
@@ -89,44 +90,42 @@ def canonify_domain(dom: str):
8990 'utf8' ,
9091}
9192
92- default_qkeep = {
93+ default_qkeep = [
9394 # ok, various BBS have it, hackernews has it etc?
9495 # hopefully it's a reasonable one to keep..
9596 'id' ,
9697
97- # common to some sites..
98+ # common to forums, usually means 'thread'
99+ 't' ,
100+
101+ # common to some sites.., usually 'post'
98102 'p' ,
99- }
103+ ]
100104
101105# TODO perhaps, decide if fragment is meaningful (e.g. wiki) or random sequence of letters?
102106class Spec (NamedTuple ):
103- qkeep : Optional [Set [str ]] = None
107+ qkeep : Optional [Collection [str ]] = None
104108 qremove : Optional [Set [str ]] = None
105109 fkeep : bool = False
106110
107- def keep_query (self , q : str ) -> bool :
108- qkeep = default_qkeep .union (self .qkeep or {})
111+ def keep_query (self , q : str ) -> Optional [int ]: # returns order
112+ qkeep = {
113+ q : i for i , q in enumerate (chain (default_qkeep , self .qkeep or []))
114+ }
109115 qremove = default_qremove .union (self .qremove or {})
110116 # I suppose 'remove' is only useful for logging. we remove by default anyway
111117
112118 keep = False
113119 remove = False
114- if qkeep is not None and q in qkeep :
115- keep = True
116- # pylint: disable=unsupported-membership-test
120+ qk = qkeep .get (q )
121+ if qk is not None :
122+ return qk
123+ # todo later, check if spec tells both to keep and remove?
117124 if q in qremove :
118- remove = True
119- if keep and remove :
120- # keep wins?
121- return True # TODO need a warning
122- if keep :
123- return True
124- if remove :
125- return False
126-
125+ return None
127126 # by default drop all
128127 # it's a better default, since if it's *too* unified, the user would notice it. but not vice versa!
129- return False
128+ return None
130129
131130 @classmethod
132131 def make (cls , ** kwargs ):
@@ -138,14 +137,13 @@ def make(cls, **kwargs):
138137specs = {
139138 'youtube.com' : S (
140139 # TODO search_query?
141- qkeep = {
142- 'list' , # TODO hmm. list is kinda important for playlist urls...
140+ qkeep = [ # note: experimental.. order matters here
143141 'v' ,
144- }, # TODO frozenset?
142+ 't' ,
143+ 'list' ,
144+ ],
145+ # todo frozenset?
145146 qremove = {
146- 't' , # TODO not so sure about it
147-
148-
149147 'time_continue' , 'index' , 'feature' , 'lc' , 'app' , 'start_radio' , 'pbjreload' , 'annotation_id' ,
150148 'flow' , 'sort' , 'view' ,
151149 'enablejsapi' , 'wmode' , 'html5' , 'autoplay' , 'ar' ,
@@ -181,10 +179,9 @@ def make(cls, **kwargs):
181179 'wikipedia.org' : S (fkeep = True ),
182180 'scottaaronson.com' : S (qkeep = {'p' }, fkeep = True ),
183181 'urbandictionary.com' : S (qkeep = {'term' }),
184- 'ycombinator.com' : S (qkeep = {'id' }),
182+ 'ycombinator.com' : S (qkeep = {'id' }), # todo just keep id by default?
185183 'play.google.com' : S (qkeep = {'id' }),
186184 'answers.yahoo.com' : S (qkeep = {'qid' }),
187- 'ubuntuforums.org' : S (qkeep = {'t' }),
188185}
189186
190187_def_spec = S ()
@@ -410,8 +407,12 @@ def canonify(url: str) -> str:
410407 # frag = parts.fragment if spec.fkeep else ''
411408 frag = ''
412409
413- qq = [(k , v ) for k , v in qq if spec .keep_query (k )]
414- qq = list (sorted (qq )) # order matters in theory, but definitely not by default
410+ iqq = []
411+ for k , v in qq :
412+ order = spec .keep_query (k )
413+ if order is not None :
414+ iqq .append ((order , k , v ))
415+ qq = [(k , v ) for i , k , v in sorted (iqq )]
415416 # TODO still not sure what we should do..
416417 # quote_plus replaces %20 with +, not sure if we want it...
417418 query = urlencode (qq , quote_via = quote_via ) # type: ignore[type-var]
0 commit comments