Source code for gaetk2.tools.unicode

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
gaetk2/tools/unicode.py

Copyright (c) 2007, 2015, 2016 HUDORA GmbH. BSD Licensed.
"""

from __future__ import unicode_literals

import doctest
import re
import string
import sys
import unicodedata

from types import StringType

from gaetk2.tools import unicode_helper_latin1


def de_utf8(data):
    """Decode UTF-8 byte strings to unicode and pass everything else through.

    This is meant to help with utf-8 data appearing where unicode should
    appear. In particular my DB is returning the wrong thing.

    Args:
        data: any object.

    Returns:
        ``data.decode('utf-8')`` if `data` is a byte string, otherwise
        `data` itself, unchanged.

    Raises:
        UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    # `bytes` is the very same type as `types.StringType` (i.e. `str`) on
    # Python 2.6+, but unlike `StringType` it also exists on Python 3.
    if isinstance(data, bytes):
        return data.decode('utf-8')
    return data
# native, HTML, default Unicode (Code page 850), Unicode combined Character, Windows-1250 _recodings = {'ae': ['ä', 'ä', 'ä', '\u00E4', '\u00E4', '\u0308a', '\xc3\xa4'], 'oe': ['ö', 'ö', 'ö', '\u00F6', '\u00F6', '\u0308o', '\xc3\xb6', 'á'], 'ue': ['ü', 'ü', 'ü', '\u00FC', '\u00FC', '\u0308u', '\xc3\xbc'], 'Ae': ['Ä', 'Ä', 'Ä', '\u00C4', '\u00C4', '\u0308A', '\xc3\x84'], 'Oe': ['Ö', 'Ö', 'Ö', '\u00D6', '\u00D6', '\u0308O', '\xc3\x96', 'Ó'], 'Ue': ['Ü', 'Ü', 'Ü', '\u00DC', '\u00DC', '\u0308U', '\xc3\x9c'], 'ss': ['ß', 'ß', 'ß', '\u00DF', '\u00DF', '\xc3\x9f', 'ß'], 'e': ['é', 'é', '\xc3\xa9', 'ê', 'è'], 'i': ['í', 'í'], 'E': ['É', 'È'], "'": ['´', '´', '`', '`'], }
def de_umlaut(data):
    """Transliterate a text to ASCII, spelling out German umlauts.

    >>> de_umlaut('1 Über Hügel saß René äöüÄÖÜß')
    '1 Ueber Huegel sass Rene aeoeueAeOeUess'

    Every spelling listed in ``_recodings`` is replaced by its ASCII
    counterpart. The text is then NFKD-normalized and encoded as ASCII
    with the ``'replace'`` error handler.

    Raises:
        ValueError: if the final ASCII encoding fails with a
            UnicodeEncodeError or UnicodeDecodeError.
    """
    # maybe https://pypi.python.org/pypi/Unidecode is a nice addition?
    for replacement, spellings in _recodings.items():
        for spelling in spellings:
            try:
                data = data.replace(spelling, replacement)
            except UnicodeDecodeError:
                # Best effort: keep `data` as it is and try the next spelling.
                pass

    try:
        data = unicodedata.normalize('NFKD', data)
    except TypeError:
        # normalize() rejected this type of input; continue un-normalized.
        pass

    try:
        return data.encode('ascii', 'replace')
    except (UnicodeEncodeError, UnicodeDecodeError) as msg:
        raise ValueError('%s: %r' % (msg, data))
# see http://instagram-engineering.tumblr.com/post/118304328152
# to learn more about the mess
try:
    # Patterns addressing code points beyond the Basic Multilingual Plane
    # directly. Compiling them raises re.error on builds which can not
    # represent such a range (see the fallback below).
    HIGHPOINTS = re.compile('[\U00010000-\U0010ffff]')
    EMOJI = re.compile('([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
except re.error:
    # UCS-2 build: match the surrogate pairs instead.
    HIGHPOINTS = re.compile('[\uD800-\uDBFF][\uDC00-\uDFFF]')
    EMOJI = re.compile(
        '([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')

DASHES = ''
# Crap from Windows-1250: ‚„…‰Š‹ŚŤŽŹ‘’“”•–—™›ˇ¦©«®·»śťžźĄąĽľ
# Crap from Unicode: …

# Maps a plain replacement to a string containing all the characters which
# should be turned into that replacement. Adjacent string literals are
# concatenated, so each of the first two values is ONE string.
REPLACRS = {
    # see https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    # Replaced by a plain space:
    ' ': '\u00A0'  # NO-BREAK SPACE: as a space, but often not adjusted
         '\u2000'  # EN QUAD: 1 en (= 1/2 em)
         '\u2001'  # EM QUAD: 1 em (nominally, the height of the font)
         '\u2002'  # EN SPACE: 1 en (= 1/2 em)
         '\u2003'  # EM SPACE: 1 em
         '\u2007'  # FIGURE SPACE: "tabular width", the width of digits
         '\u2004'  # THREE-PER-EM SPACE: 1/3 em
         '\u2005'  # FOUR-PER-EM SPACE: 1/4 em
         '\u2006'  # SIX-PER-EM SPACE: 1/6 em
         '\u2008'  # PUNCTUATION SPACE: the width of a period "."
         '\u2009'  # THIN SPACE: 1/5 em (or sometimes 1/6 em)
         '\u200A'  # HAIR SPACE: narrower than THIN SPACE
         '\u205F'  # MEDIUM MATHEMATICAL SPACE: 4/18 em
         '\u3000',  # IDEOGRAPHIC SPACE: the width of ideographic (CJK) characters
    # Removed entirely (replaced by the empty string):
    '': '\u1680'  # OGHAM SPACE MARK: unspecified; usually not really a space but a dash
        '\u180E'  # MONGOLIAN VOWEL SEPARATOR: no width
        '\u200B'  # ZERO WIDTH SPACE: nominally no width, but may expand
        '\u202F'  # NARROW NO-BREAK SPACE: narrower than NO-BREAK SPACE (or SPACE)
        '\uFEFF',  # ZERO WIDTH NO-BREAK SPACE: no width (the character is invisible)
    '-': '-‐‑–‒—―',  # see https://www.cs.tut.fi/~jkorpela/dashes.html
    '_': '_﹏',
    '*': '※⁕⁜*⁎∗·•◦‣⦿⦾⁃◘',
    '!': '!¡‼❕',
    '?': '¿⁇❓❔؟‽',
    '~': '~〜',
    '&': '&&⅋﹠',
    "'": 'ʻˮ՚Ꞌꞌ‘’′',
    '"': '“”″‴〃„',
    '/': '/\⁄\\',
    '|': '¦|',
    '<': '‹«',
    '>': '›»',
}

# Inverted table: one entry per noisy character, pointing to its replacement.
# this could be an one liner
REVERSEREPLACRS = {}
for result, inputs in REPLACRS.iteritems():
    for c in inputs:
        REVERSEREPLACRS[c] = result
# A single character class matching every key of REVERSEREPLACRS. The keys
# are passed through re.escape() so none of them is interpreted as regular
# expression syntax.
REPLACE_RE = re.compile('[%s]' % ''.join([re.escape(x) for x in REVERSEREPLACRS.keys()]))
def de_noise(data):
    """Removes all stuff which should not appear in normal Western Text.

    The input is converted to unicode and NFC-normalized. Every character
    matched by ``REPLACE_RE`` is then exchanged for its plain counterpart
    from ``REVERSEREPLACRS``, and the result is finally handed to
    ``unicode_helper_latin1.de_noise_latin1()``, whose return value is
    returned unchanged.

    >>> de_noise(u'»Susie`s Giga\\Super-Markt®¿«')
    u">Susie's Giga/Super-Markt(R)?<"
    >>> de_noise(u"ümlaut eins:\x01")
    u'\\xfcmlaut eins:'
    >>> de_noise(u'«A» {C} ¿D? „E“ ›F‹')
    u'<A> (C) ?D? "E" >F<'
    >>> de_noise(u'`A´')
    u"'A'"
    >>> de_noise(u'«😎» Umlaute kann doctest !gut {®} ¿👩‍👩‍👧‍👦? „👨‍❤️‍💋‍👨“ ›🎅🏻🎅🏼🎅🏽🎅🏾🎅🏿‹')
    u'<> Umlaute kann doctest !gut ((R)) ?? "" ><'
    >>> de_noise(u'DE37  330 5 13 50 0 010  4414  22')
    u'DE37330513500010441422'
    """
    def _plain_counterpart(match):
        # REPLACE_RE is built from the keys of REVERSEREPLACRS, so the
        # '_' default is merely a safety net.
        return REVERSEREPLACRS.get(match.group(0), '_')

    normalized = unicodedata.normalize('NFC', unicode(data))
    substituted = REPLACE_RE.sub(_plain_counterpart, normalized)
    return unicode_helper_latin1.de_noise_latin1(substituted)
# from http://code.activestate.com/recipes/577257/ _SLUGIFY_STRIP_RE = re.compile(r'[^\w\s-]') _SLUGIFY_HYPHENATE_RE = re.compile(r'[-\s]+')
def slugify(value):
    """Converts a string to be usable in URLs without escaping.

    The value is run through de_umlaut() and de_noise(), NFKD-normalized
    and reduced to ASCII (anything else is dropped). Characters which are
    not word characters, whitespace or hyphens are removed, the result is
    stripped and lowercased, and every run of whitespace/hyphens becomes
    a single hyphen.

    Inspired by Django's "django/template/defaultfilters.py".

    Returns:
        The slug, or the empty string if `value` is None.
    """
    if value is None:
        return ''

    denoised = de_noise(de_umlaut(value))
    ascii_only = unicodedata.normalize('NFKD', unicode(denoised)).encode('ascii', 'ignore')
    without_specials = _SLUGIFY_STRIP_RE.sub('', ascii_only)
    lowered = unicode(without_specials.strip().lower())
    return _SLUGIFY_HYPHENATE_RE.sub('-', lowered)
# from http://stackoverflow.com/questions/561486

# Prefix marking an encoded negative number.
SIGN_CHARACTER = '$'

# base62: digits, uppercase letters, lowercase letters - in this order.
ALPHABET = string.digits + string.ascii_uppercase + string.ascii_lowercase
BASE = len(ALPHABET)
# Maps each character of ALPHABET to its position (= its numeric value).
ALPHABET_REVERSE = dict(zip(ALPHABET, range(BASE)))

# base36: digits and uppercase letters only.
SHORTALPHABET = string.digits + string.ascii_uppercase
SHORTBASE = len(SHORTALPHABET)
# Maps each character of SHORTALPHABET to its position.
SHORTALPHABET_REVERSE = dict(zip(SHORTALPHABET, range(SHORTBASE)))
def num_encode(n):
    """Convert an integer to a base62 encoded string.

    The digits are taken from ``ALPHABET``. A negative number is encoded
    as ``SIGN_CHARACTER`` followed by the encoding of ``-n``.
    """
    sign = ''
    if n < 0:
        sign = SIGN_CHARACTER
        n = -n

    digits = []
    # `not digits` forces one pass for n == 0, so zero encodes as one digit.
    while n or not digits:
        n, remainder = divmod(n, BASE)
        digits.append(ALPHABET[remainder])
    # The least significant digit was collected first.
    digits.reverse()
    return sign + ''.join(digits)
def num_decode(s):
    """Convert the result of num_encode() back to an integer.

    Each leading ``SIGN_CHARACTER`` negates the result.

    Raises:
        IndexError: if `s` is empty or consists of sign characters only.
        KeyError: if `s` contains a character which is not in ``ALPHABET``.
    """
    factor = 1
    # s[0] is evaluated even when nothing is left, so an empty remainder
    # raises IndexError.
    while s[0] == SIGN_CHARACTER:
        factor = -factor
        s = s[1:]

    value = 0
    for char in s:
        value = value * BASE + ALPHABET_REVERSE[char]
    return factor * value
def num_encode_uppercase(n):
    """Convert an integer to a base36 (only uppercase and numbers) encoded string.

    The digits are taken from ``SHORTALPHABET``. A negative number is
    encoded as ``SIGN_CHARACTER`` followed by the encoding of ``-n``.
    """
    prefix = ''
    if n < 0:
        prefix, n = SIGN_CHARACTER, -n

    symbols = []
    # `not symbols` guarantees at least one digit, i.e. 0 encodes as '0'.
    while n or not symbols:
        n, position = divmod(n, SHORTBASE)
        symbols.append(SHORTALPHABET[position])
    # Collected least significant digit first - flip before joining.
    symbols.reverse()
    return prefix + ''.join(symbols)
if __name__ == '__main__':
    # Print two demo conversions, then run the doctests of this module.
    # With a single argument the parenthesized print behaves exactly like
    # the print statement.
    print(de_noise("`Iñtërnâtiônàlizætiøn!'"))
    print(de_noise('«😎» `Iñtërnâtiônàlizætiøn´ {®} ¿👩‍👩‍👧‍👦? „👨‍❤️‍💋‍👨“ ›🎅🏻🎅🏼🎅🏽🎅🏾🎅🏿‹'))
    doctest_result = doctest.testmod()
    # testmod() yields (failure_count, test_count); the number of failed
    # examples becomes the exit status.
    sys.exit(doctest_result[0])