python-cheat/string_cheat.py at master · cirosantilli/python-cheat · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
## string

## str

There are 2 commonly used classes which are informally called *strings* in Python 2:
*str* and *unicode*.

*basestring* is their common ancestor.
"""

import sys

# Instances of both string classes are instances of `basestring`,
# so it can be used to test for "any kind of string".
assert isinstance(str(), basestring)
assert isinstance(unicode(), basestring)
# `bytearray` is not a `basestring`.
assert not isinstance(bytearray(), basestring)

if '## string literals':

    if '## Single vs double quotes':

        # There is no semantic difference between the two quote styles:

        assert 'abc' == "abc"

        # Except for escaping quotes themselves:

        assert "'" == '\''
        assert '"' == "\""
        assert "\n" == '\n'

        # Advantages of single quote `'`:

        # - prints as fewer pixels so less noisy
        # - easier to type on a standard keyboard since no shift required

        # PEP 8 and Google Style say choose one and stick with it.
        # Only use the other to avoid backslash escaping the one.

        # Other styles promote a semantic differentiation:

        # - `'` for identifiers, e.g., map keys
        # - `"` for human readable messages: `print "Hello world!"`

        # But this differentiation is harder to maintain.

    if '## multiline strings ## triple quotes':

        # Like in C, whitespace separated strings are glued together:

        assert 'ab' 'cd' == 'abcd'

        # This means that backslash continuation joins strings without newlines:

        assert \
            'ab' \
            'cd' == 'abcd'

        # Triple quoted literals may span several lines.
        # The newline becomes part of the string:

        assert """a
b""" == 'a\nb'

        assert '''a
b''' == 'a\nb'

        # Spaces are kept:

        assert '''a
 b''' == 'a\n b'

        # The continuation lines of the literal are taken verbatim,
        # regardless of the indentation of the enclosing block:

        def f():
            """Check a triple quoted literal whose second line starts at column 0."""
            assert """a
b""" == 'a\nb'
        f()

    # Backslash escapes are like in C.

    assert '\x61' == 'a'
    assert '\n' == '\x0A'

    if '## raw string literals ## r literals':

        # Raw literals lose all backslash escape magic.

        assert r'\n' == '\\n'
        assert r'\\' == '\\\\'

        # The exception is to escape the character that would terminate the string (`'` or `"`).
        # In that case, the backslash *remains* in the string:

        assert r'\"' == "\\\""

        # A consequence is that it is impossible to write a raw string literal that ends in backslash.

        assert '\\' != r''

        # Raw string literals are often used with regular expressions,
        # which often contain many literal backslashes.

# Character access is like for a list:

assert 'ab'[0] == 'a'
assert 'ab'[1] == 'b'

# Unlike lists, strings are immutable, so it is not possible to assign to an element:
# attempting it raises `TypeError`.

try:
    'ab'[0] = 'a'
except TypeError:
    pass
else:
    # Only reached if the assignment did not raise.
    assert False

if '## Concatenate':

    # Used to be inefficient because strings are immutable, but CPython improved it:
    # http://stackoverflow.com/questions/4435169/good-way-to-append-to-a-string

    # If you are really concerned, use a list and then join at the end.

    assert 'ab' + 'cd' == 'abcd'

# For string literals, adjacent literals are glued together without any operator:

assert 'ab' 'cd' == 'abcd'

# Repetition with `*`:

assert 'ab' * 3 == 'ababab'

# `replace`: replaces every occurrence by default.
# The optional third argument limits the number of replacements:

assert 'aabbcc'.replace('b',  '0')    == 'aa00cc'
assert 'aabbcc'.replace('bb', '0')    == 'aa0cc'
assert 'aabbcc'.replace('b',  '0', 1) == 'aa0bcc'

if '## string module':

    import string

    if '## constants':

        # The constant is passed through the `string-escape` codec before printing,
        # so that the whitespace characters show up as backslash escapes.
        print 'string.whitespace = ' + string.whitespace.encode('string-escape')

if '## join':

    # The string on which `join` is called is inserted between the elements:

    assert '0'.join(['a', 'b', 'c']) == 'a0b0c'

    # Why not a method of list?
    # http://stackoverflow.com/questions/493819/python-join-why-is-it-string-joinlist-instead-of-list-joinstring

if '## split':

    # Split string into a list of strings:

    assert '0ab1ab2'.split('ab') == ['0', '1', '2']

    # Two adjacent separators produce an empty string between them:

    assert '0abab2'.split('ab')  == ['0', '', '2']

    # If the separator is not given, splits at runs of one or more whitespace
    # characters, i.e. like a `[string.whitespace]+` **regex**!:
    # Very confusing default that changes behaviour completely!
    # But kind of useful default.

    assert '0 1\t \n2'.split() == ['0', '1', '2']

    if '## splitlines':

        # Split at `(\r\n|\n|\r)` regex, *and* exclude the last empty string that split
        # would generate if there was a trailing newline.

        assert '0\n1\r2\r\n3\n'.splitlines()  == ['0', '1', '2', '3']

if '## strip ## rstrip ## lstrip':

    """
    Strip chars from either the beginning or the end, *not* the middle!

    Characters to strip are given as a string, which is used as a set of characters.

    Default argument: `string.whitespace`

    rstrip and lstrip are the one sided versions.
    """

    assert 'cbaba0a1b2ccba'.strip('abc') == '0a1b2'
    assert '\t\n0 1 2\v \r'.strip() == '0 1 2'

    # One sided versions:

    assert 'aab0baa'.lstrip('ab') == '0baa'
    assert 'aab0baa'.rstrip('ab') == 'aab0'

    if '## chomp':

        """
        chomp is a Perl function that takes one newline off the end.

        rstrip removes *all* the trailing characters that match,
        therefore rstrip is not equivalent to chomp!

        Best way to do it: http://stackoverflow.com/a/19531239/895245
        """

        # Both trailing newlines are removed, chomp would only remove one:

        assert 'a\n\n'.rstrip('\n') == 'a'

if '## startswith':

    # `startswith` returns a bool, so it is asserted directly
    # instead of being compared to `True` / `False`.

    assert 'abc'.startswith('ab')
    assert not 'abc'.startswith('bc')

    # Remove prefix: <http://stackoverflow.com/questions/599953/python-how-to-remove-the-left-part-of-a-string>

    # If sure that the prefix is there, slice it off by length:

    prefix = 'ab'
    assert 'abcd'[len(prefix):] == 'cd'

    # Otherwise, check with `startswith` before slicing:

    prefix = 'ab'
    s = 'abcd'
    if s.startswith(prefix):
        assert s[len(prefix):] == 'cd'

if '## contains substring':
    # `in` looks for the left string as a contiguous substring,
    # so having both characters present in another order is not enough:
    assert 'bc' in 'abcd'
    assert 'bc' not in 'abdc'
    # The empty string is contained in all others:
    assert '' in ''
    assert '' in 'a'

# String to number: call the numeric type on the string.

assert int('123') == 123
assert float('12.34e56') == 12.34e56

# Char to int:

assert ord('a') == 97

# Encode: the `string-escape` codec turns the newline character
# into the two characters backslash and `n`.

assert '\n'.encode('string-escape') == '\\n'

# `string-escape` is similar to `repr`.

if '## unicode ## encodings':

    """
    Before reading this you should understand what is ASCII, Unicode,
    UTF8, UTF16.

    The difference between the `unicode` and `str` classes is that:

    -   `str` is just an array of bytes.

        These could represent ASCII chars since those fit into 1 byte,
        but they could also represent UTF8 chars.

        If they represent UTF8 chars, which may have more than 1 byte per char,
        the str class has no way to know where each char begins and ends,
        so s[0] gives gibberish.

        `str` is the output of an encode operation, or the input of a decode operation.

    -   `unicode`: represents actual Unicode characters.

        Unicode strings do not have an explicit encoding,
        although Python probably uses one int per char containing the Unicode code of that character.

        `unicode` is the output of a decode operation, or the input of an encode operation.
    """

    """
    To be able to use utf8 directly in Python source,
    the first or second line of the file *must* be a comment containing:

        -*- coding: utf-8 -*-

    Otherwise, you must use escape characters.

    This changes in Python 3 where utf-8 becomes the default encoding.
    """

    if '## u backslash escapes ## unicode literals':

        """
        Unicode literals are just like string literals, but they start with `u`.

        The string below has 2 characters. Characters are treated differently depending on
        if strings are str or unicode.
        """

        u = u'\u4E2D\u6587'
        assert u[0] == u'\u4E2D'
        assert u[1] == u'\u6587'

        """
        Each escape represents exactly one Unicode character,
        however some escapes cannot represent all characters.
        The possible escapes are:

        -   `\xAA` represents one character of up to one byte.

            This is not very useful with Unicode, since most of those characters
            have a printable and therefore more readable ASCII char to represent them.

            Characters with more than 1 byte cannot be represented with a `\xAA` escape.

        -   `\uAAAA`: 2 bytes.

            This is the most useful escape, as the most common unicode code points
            use either one or 2 bytes.

        -   `\UAAAAAAAA`: 4 bytes

            It is very rare to have to use `\UAAAAAAAA` literals,
            since Unicode plane 0 which contains the most common characters
            fits into two bytes.

            Also note that `\U0010FFFF` is the largest possible character:
            the first byte must always be 0, since that is as far as Unicode goes.

        Remember: `\` escapes are interpreted inside multiline comment strings.
        Therefore, if you write an invalid escape like `\\xYY`, your code will not run!
        """

        assert u'\u4E2D\u6587' == u'中文'
        assert u'\U00010000' == u'𐀀'

        """
        A check is done to confirm that an escape is a known unicode character.
        For example `\UAAAAAAAA` does not currently represent any Unicode character,
        so you cannot use it.
        """

        #u'\UAAAAAAAA'

        # Unicode \u escapes are only interpreted inside unicode string literals.
        # Inside a str literal the backslash, the `u` and the digits are kept as they are:

        s = '\u4E2D\u6587'
        assert s[0] == '\\'
        assert s[1] == 'u'
        assert s[2] == '4'

    """
    ## encode

    ## decode

        Encode transforms an `unicode` string to a byte string `str`.

        Decode transforms a byte string `str` to an `unicode` string.
    """

    assert u'中'.encode('utf-8') == '\xE4\xB8\xAD'
    assert u'中' == '\xE4\xB8\xAD'.decode('utf-8')

    # Most escapes in str literals are also interpreted inside unicode literals.

    assert u'\n'.encode('ASCII') == '\n'

    # When mixing encodings implicitly, ASCII is assumed by default,
    # so things break only if there are non-ASCII chars.
    # Don't do any of the following:

    assert u'a' == 'a'
    assert u'\u20AC' != '\x20\xAC'

    # In the `try` blocks below, the `else` branch fails explicitly with `assert False`
    # if the expected exception was not raised. A bare `raise` there would have
    # no exception of its own to re-raise and would report a misleading error.

    try:
        str(u'\u20AC')
    except UnicodeEncodeError:
        #'ascii' codec can't encode character u'\u20ac' in position 0: ordinal not in range(128)
        pass
    else:
        assert False

    try:
        # `decode` is meant for byte strings: called on a unicode string,
        # the failure is an *encode* error, as the message below shows.
        u'\u20AC'.decode('utf-8')
    except UnicodeEncodeError:
        #'ascii' codec can't encode character u'\u20ac' in position 0: ordinal not in range(128)
        pass
    else:
        assert False

    try:
        unicode('\x20\xAC')
    except UnicodeDecodeError:
        #'ascii' codec can't decode byte 0xac in position 1: ordinal not in range(128)
        pass
    else:
        assert False

    """
    ## Normalization

        Some unicode characters can be represented by multiple sequences.

        This is so for backwards compatibility with older encodings,
        and happens most often for accentuated versions of latin characters.

        unicode strings with different normalizations compare False.

        Normalization may be modified via `unicodedata`.
    """

    assert u'\u00EAtre' != u'e\u0302tre'

    import unicodedata
    assert unicodedata.normalize('NFC', u'\u00eatre') == unicodedata.normalize('NFC', u'e\u0302tre')

    """
    IO is typically done via arrays of bytes since that is how the system really sees it,
    and not via unicode chars.

    This includes operations like:

    - print
    - sys.stdout.write
    - file open + write

    There may be however some convenience wrappers that deal with encoding.
    For example, `codecs.open` opens a file in an encoding aware manner.
    """

    """
    ## Unicode and file IO

        First remember that `sys.stdout` is a file object,
        so terminal and file IO is much the same.

        Terminal output via `print` or `sys.stdout.write` always uses str byte strings.

        If given unicode, it first encodes via `sys.stdout.encoding`

        TODO how sys.stdout.encoding is determined
        TODO pipes affect `sys.stdout.encoding`?

        If print output goes to a pipe, `sys.stdout.encoding` is `None`,
        in which case `ASCII` conversion is attempted.
        If there are any non ASCII characters, this leads to an exception!
        Therefore, if it is ever possible that there could be unicode chars
        on the output string, encode it explicitly.
    """

    print 'sys.stdout.encoding = ' + str(sys.stdout.encoding)

    # BAD: will raise an exception if output to a pipe!

    #print u'中文'

    # GOOD:

    print u'中文'.encode('utf-8')

if '## lower and upper case':

    # Both methods convert the case of every letter of the string:

    assert 'aBcD'.lower() == 'abcd'
    assert 'aBcD'.upper() == 'ABCD'