third_party/cython/src/Cython/Compiler/StringEncoding.py - mojo - Git at Google

 #
 #   Cython -- encoding related tools
 #

 import re
 import sys

 # Version-neutral aliases for the text and byte string types.
 IS_PYTHON3 = sys.version_info[0] >= 3
 if IS_PYTHON3:
     _unicode, _str, _bytes = str, str, bytes
 else:
     _unicode, _str, _bytes = unicode, str, str

 # Empty instances of both string kinds.
 empty_bytes = _bytes()
 empty_unicode = _unicode()

 # Bound join() of the empty byte string: concatenates byte strings.
 join_bytes = empty_bytes.join

 class UnicodeLiteralBuilder(object):
     """Assemble a unicode string.

     Text fragments and single code points are collected in ``self.chars``
     and joined into one EncodedString by getstring().
     """
     # unichr() only exists on Python 2; on Python 3 chr() already accepts
     # any code point, so use whichever one is available.
     try:
         _unichr = staticmethod(unichr)
     except NameError:
         _unichr = staticmethod(chr)

     def __init__(self):
         self.chars = []

     def append(self, characters):
         """Append a text fragment; byte strings are decoded as ASCII."""
         if isinstance(characters, _bytes):
             # this came from a Py2 string literal in the parser code
             characters = characters.decode("ASCII")
         assert isinstance(characters, _unicode), str(type(characters))
         self.chars.append(characters)

     if sys.maxunicode == 65535:
         def append_charval(self, char_number):
             """Append the character with code point char_number."""
             if char_number > 65535:
                 # wide Unicode character on narrow platform => replace
                 # by surrogate pair
                 high, low = divmod(char_number - 0x10000, 1024)
                 self.chars.append(self._unichr(high + 0xD800))
                 self.chars.append(self._unichr(low + 0xDC00))
             else:
                 self.chars.append(self._unichr(char_number))
     else:
         def append_charval(self, char_number):
             """Append the character with code point char_number."""
             self.chars.append(self._unichr(char_number))

     def append_uescape(self, char_number, escape_string):
         """Append a unicode escape as its character.

         escape_string is accepted for interface compatibility with the
         other builders and is not used here.
         """
         self.append_charval(char_number)

     def getstring(self):
         """Return the collected text as an EncodedString."""
         return EncodedString(u''.join(self.chars))

     def getstrings(self):
         """Return a (bytes, unicode) pair; the bytes slot is always None."""
         return (None, self.getstring())


 class BytesLiteralBuilder(object):
     """Assemble a byte string or char value.

     Fragments are collected as byte strings in ``self.chars``; text
     fragments are encoded with ``target_encoding`` first.
     """
     # unichr() only exists on Python 2; on Python 3 chr() already accepts
     # any code point, so use whichever one is available.
     try:
         _unichr = staticmethod(unichr)
     except NameError:
         _unichr = staticmethod(chr)

     def __init__(self, target_encoding):
         self.chars = []
         self.target_encoding = target_encoding

     def append(self, characters):
         """Append a fragment, encoding unicode text to the target encoding."""
         if isinstance(characters, _unicode):
             characters = characters.encode(self.target_encoding)
         assert isinstance(characters, _bytes), str(type(characters))
         self.chars.append(characters)

     def append_charval(self, char_number):
         """Append the single byte with value char_number.

         Raises UnicodeEncodeError if the value cannot be encoded as
         ISO-8859-1, i.e. if it is above 255.
         """
         self.chars.append(self._unichr(char_number).encode('ISO-8859-1'))

     def append_uescape(self, char_number, escape_string):
         """Append a unicode escape verbatim; char_number is not used."""
         self.append(escape_string)

     def getstring(self):
         """Return the collected bytes as a BytesLiteral tagged with the
         target encoding."""
         # this *must* return a byte string!
         s = BytesLiteral(join_bytes(self.chars))
         s.encoding = self.target_encoding
         return s

     def getchar(self):
         """Return the collected bytes; same result as getstring()."""
         # this *must* return a byte string!
         return self.getstring()

     def getstrings(self):
         """Return a (bytes, unicode) pair; the unicode slot is always None."""
         return (self.getstring(), None)

 class StrLiteralBuilder(object):
     """Build the bytes form and the unicode form of one literal in parallel.
     """
     def __init__(self, target_encoding):
         self._bytes = BytesLiteralBuilder(target_encoding)
         self._unicode = UnicodeLiteralBuilder()

     def append(self, characters):
         # hand the same fragment to both builders, bytes first
         for builder in (self._bytes, self._unicode):
             builder.append(characters)

     def append_charval(self, char_number):
         # hand the same character value to both builders, bytes first
         for builder in (self._bytes, self._unicode):
             builder.append_charval(char_number)

     def append_uescape(self, char_number, escape_string):
         # the bytes form keeps the escape text as written, the unicode
         # form receives the character itself
         bytes_builder, unicode_builder = self._bytes, self._unicode
         bytes_builder.append(escape_string)
         unicode_builder.append_charval(char_number)

     def getstrings(self):
         # (bytes, unicode) pair of the two assembled results
         bytes_value = self._bytes.getstring()
         unicode_value = self._unicode.getstring()
         return (bytes_value, unicode_value)


 class EncodedString(_unicode):
     # Text string that remembers the encoding of the source it came
     # from.  For genuine unicode strings 'encoding' stays None,
     # otherwise it names the source encoding.
     encoding = None

     def __deepcopy__(self, memo):
         # a deep copy hands back the very same object
         return self

     def byteencode(self):
         # only valid when a source encoding is recorded
         source_encoding = self.encoding
         assert source_encoding is not None
         return self.encode(source_encoding)

     def utf8encode(self):
         # only valid for genuine unicode strings (no source encoding)
         assert self.encoding is None
         return self.encode("UTF-8")

     @property
     def is_unicode(self):
         # True exactly when no source encoding is recorded
         return self.encoding is None

     def contains_surrogates(self):
         # delegates to the module level helper
         return string_contains_surrogates(self)


 def string_contains_surrogates(ustring):
     """
     Tell whether the unicode string holds any code point that would
     take two code units on a narrow (UTF-16) CPython build: either a
     character above 0xFFFF (only seen on wide builds) or a surrogate
     code point in the range 0xD800-0xDFFF.
     """
     return any(
         code > 65535 or 0xD800 <= code <= 0xDFFF
         for code in map(ord, ustring))


 class BytesLiteral(_bytes):
     # byte string counterpart of EncodedString offering the same helpers
     encoding = None
     is_unicode = False

     def __deepcopy__(self, memo):
         # a deep copy hands back the very same object
         return self

     def byteencode(self):
         if not IS_PYTHON3:
             # fake-recode the string to make it a plain bytes object
             return self.decode('ISO-8859-1').encode('ISO-8859-1')
         return _bytes(self)

     def utf8encode(self):
         # byte strings have no UTF-8 form; calling this is a usage error
         assert False, "this is not a unicode string: %r" % self

     def __str__(self):
         """Decode the bytes as ISO-8859-1 so that the value can take
         part in % formatting of unicode strings.
         """
         return self.decode('ISO-8859-1')


 # Lookup from a two-character escape spelling (backslash + letter) to
 # the character it stands for; unknown spellings give None.
 char_from_escape_sequence = dict([
     (r'\a', u'\a'),
     (r'\b', u'\b'),
     (r'\f', u'\f'),
     (r'\n', u'\n'),
     (r'\r', u'\r'),
     (r'\t', u'\t'),
     (r'\v', u'\v'),
     ]).get

 # Sequences that get rewritten by the specials replacer: backslash,
 # the '??' pair (presumably to avoid C trigraphs), the double quote
 # and the 32 control characters below the space character.
 _c_special = ('\\', '??', '"') + tuple([chr(code) for code in range(32)])


 def _to_escape_sequence(s):
     """Return the escaped spelling of one special sequence.

     Backslash and double quote get a backslash prefix, newline,
     carriage return and tab use their letter escapes, and everything
     else is spelled as three-digit octal escapes, one per character.
     """
     if s == '\\':
         return r'\\'
     if s == '"':
         return r'\"'
     if s in '\n\r\t':
         return repr(s)[1:-1]
     # octal escapes are at most three digits long, so they stay
     # unambiguous inside a longer character sequence, unlike hex
     return ''.join(['\\%03o' % ord(ch) for ch in s])


 def _build_specials_replacer():
     """Create the function that rewrites every special sequence of
     _c_special inside a byte string to its escaped spelling.
     """
     alternatives = []
     escaped_form = {}
     for seq in _c_special:
         # one single-character class per character, backslashes doubled
         pattern = ''.join(['[%s]' % ch.replace('\\', '\\\\') for ch in seq])
         alternatives.append(pattern)
         escaped_form[seq.encode('ASCII')] = _to_escape_sequence(seq).encode('ASCII')
     matcher = re.compile(('(%s)' % '|'.join(alternatives)).encode('ASCII'))

     def replace_specials(match):
         # map the matched bytes to their precomputed replacement
         return escaped_form[match.group(1)]

     def replace(s):
         return matcher.sub(replace_specials, s)

     return replace

 _replace_specials = _build_specials_replacer()


 def escape_char(c):
     """Return the spelling of a single character value for C code.

     On Python 3 the argument is first decoded from ISO-8859-1.
     Newline, carriage return, tab and backslash use their letter
     escapes, the single quote gets a backslash prefix, and values
     below 32 or above 127 become a two-digit hex escape.
     """
     if IS_PYTHON3:
         c = c.decode('ISO-8859-1')
     if c in '\n\r\t\\':
         return repr(c)[1:-1]
     if c == "'":
         return "\\'"
     code = ord(c)
     if 32 <= code <= 127:
         return c
     # hex works well for characters
     return "\\x%02X" % code

 def escape_byte_string(s):
     """Escape a byte string for embedding in C source code.

     The result is a Unicode string; encoding it as ISO-8859-1 yields
     the byte sequence that has to be written out.
     """
     escaped = _replace_specials(s)
     try:
         # nothing but ASCII left after replacing the specials => done
         return escaped.decode("ASCII")
     except UnicodeDecodeError:
         pass
     if not IS_PYTHON3:
         # Py2: iterating a byte string yields one-character strings
         pieces = []
         for ch in escaped:
             code = ord(ch)
             pieces.append('\\%3o' % code if code >= 128 else ch)
         return join_bytes(pieces).decode('ISO-8859-1')
     # Py3: iterating a bytes object yields integers
     result = bytearray()
     for byte in escaped:
         if byte < 128:
             result.append(byte)
         else:
             result.extend(('\\%3o' % byte).encode('ASCII'))
     return result.decode('ISO-8859-1')

 def split_string_literal(s, limit=2000):
     """Break a long string literal into pieces joined by '""'.

     Strings shorter than limit are returned unchanged.  A cut is moved
     back when a backslash sits in the last four characters before it,
     so that no escape sequence is split.
     """
     # MSVC can't handle long string literals.
     total = len(s)
     if total < limit:
         return s

     pieces = []
     pos = 0
     while pos < total:
         cut = pos + limit
         if total > cut - 4:
             slash_at = s[cut - 4:cut].find('\\')
             if slash_at >= 0:
                 cut -= 4 - slash_at  # just before the backslash
                 while s[cut - 1] == '\\':
                     cut -= 1
                     if cut == pos:
                         # must have been a long line of backslashes
                         cut = pos + limit - (limit % 2) - 4
                         break
         pieces.append(s[pos:cut])
         pos = cut
     return '""'.join(pieces)

 def encode_pyunicode_string(s):
     """Create Py_UNICODE[] representation of a given unicode string.

     Returns a pair ``(utf16, utf32)`` of comma separated decimal code
     unit lists, each ending with a 0 entry.  The UTF-16 part is the
     empty string when it would be identical to the UTF-32 part, i.e.
     when the string needs no surrogate pairs.
     """
     # Build a real list: on Python 3 map() returns an iterator, which
     # can neither be concatenated with a list nor iterated twice.
     s = [ord(c) for c in s]
     s.append(0)  # terminating 0 entry

     if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
         # the input already holds code points; derive the UTF-16 form
         utf16, utf32 = [], s
         for code_point in s:
             if code_point >= 0x10000:  # outside of BMP
                 high, low = divmod(code_point - 0x10000, 1024)
                 utf16.append(high + 0xD800)
                 utf16.append(low + 0xDC00)
             else:
                 utf16.append(code_point)
     else:
         # the input holds UTF-16 code units; merge surrogate pairs to
         # get the UTF-32 form
         utf16, utf32 = s, []
         for code_unit in s:
             if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                 high, low = utf32[-1], code_unit
                 utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
             else:
                 utf32.append(code_unit)

     if utf16 == utf32:
         utf16 = []
     # u"%d" gives text on both Python 2 and 3 without needing the
     # Python 2 only unicode() builtin
     return (u",".join([u"%d" % unit for unit in utf16]),
             u",".join([u"%d" % unit for unit in utf32]))
	#
	# Cython -- encoding related tools
	#

	import re
	import sys

	if sys.version_info[0] >= 3:
	_unicode, _str, _bytes = str, str, bytes
	IS_PYTHON3 = True
	else:
	_unicode, _str, _bytes = unicode, str, str
	IS_PYTHON3 = False

	empty_bytes = _bytes()
	empty_unicode = _unicode()

	join_bytes = empty_bytes.join

	class UnicodeLiteralBuilder(object):
	"""Assemble a unicode string.
	"""
	def __init__(self):
	self.chars = []

	def append(self, characters):
	if isinstance(characters, _bytes):
	# this came from a Py2 string literal in the parser code
	characters = characters.decode("ASCII")
	assert isinstance(characters, _unicode), str(type(characters))
	self.chars.append(characters)

	if sys.maxunicode == 65535:
	def append_charval(self, char_number):
	if char_number > 65535:
	# wide Unicode character on narrow platform => replace
	# by surrogate pair
	char_number -= 0x10000
	self.chars.append( unichr((char_number // 1024) + 0xD800) )
	self.chars.append( unichr((char_number % 1024) + 0xDC00) )
	else:
	self.chars.append( unichr(char_number) )
	else:
	def append_charval(self, char_number):
	self.chars.append( unichr(char_number) )

	def append_uescape(self, char_number, escape_string):
	self.append_charval(char_number)

	def getstring(self):
	return EncodedString(u''.join(self.chars))

	def getstrings(self):
	return (None, self.getstring())


	class BytesLiteralBuilder(object):
	"""Assemble a byte string or char value.
	"""
	def __init__(self, target_encoding):
	self.chars = []
	self.target_encoding = target_encoding

	def append(self, characters):
	if isinstance(characters, _unicode):
	characters = characters.encode(self.target_encoding)
	assert isinstance(characters, _bytes), str(type(characters))
	self.chars.append(characters)

	def append_charval(self, char_number):
	self.chars.append( unichr(char_number).encode('ISO-8859-1') )

	def append_uescape(self, char_number, escape_string):
	self.append(escape_string)

	def getstring(self):
	# this must return a byte string!
	s = BytesLiteral(join_bytes(self.chars))
	s.encoding = self.target_encoding
	return s

	def getchar(self):
	# this must return a byte string!
	return self.getstring()

	def getstrings(self):
	return (self.getstring(), None)

	class StrLiteralBuilder(object):
	"""Assemble both a bytes and a unicode representation of a string.
	"""
	def __init__(self, target_encoding):
	self._bytes = BytesLiteralBuilder(target_encoding)
	self._unicode = UnicodeLiteralBuilder()

	def append(self, characters):
	self._bytes.append(characters)
	self._unicode.append(characters)

	def append_charval(self, char_number):
	self._bytes.append_charval(char_number)
	self._unicode.append_charval(char_number)

	def append_uescape(self, char_number, escape_string):
	self._bytes.append(escape_string)
	self._unicode.append_charval(char_number)

	def getstrings(self):
	return (self._bytes.getstring(), self._unicode.getstring())


	class EncodedString(_unicode):
	# unicode string subclass to keep track of the original encoding.
	# 'encoding' is None for unicode strings and the source encoding
	# otherwise
	encoding = None

	def __deepcopy__(self, memo):
	return self

	def byteencode(self):
	assert self.encoding is not None
	return self.encode(self.encoding)

	def utf8encode(self):
	assert self.encoding is None
	return self.encode("UTF-8")

	@property
	def is_unicode(self):
	return self.encoding is None

	def contains_surrogates(self):
	return string_contains_surrogates(self)


	def string_contains_surrogates(ustring):
	"""
	Check if the unicode string contains surrogate code points
	on a CPython platform with wide (UCS-4) or narrow (UTF-16)
	Unicode, i.e. characters that would be spelled as two
	separate code units on a narrow platform.
	"""
	for c in map(ord, ustring):
	if c > 65535: # can only happen on wide platforms
	return True
	if 0xD800 <= c <= 0xDFFF:
	return True
	return False


	class BytesLiteral(_bytes):
	# bytes subclass that is compatible with EncodedString
	encoding = None

	def __deepcopy__(self, memo):
	return self

	def byteencode(self):
	if IS_PYTHON3:
	return _bytes(self)
	else:
	# fake-recode the string to make it a plain bytes object
	return self.decode('ISO-8859-1').encode('ISO-8859-1')

	def utf8encode(self):
	assert False, "this is not a unicode string: %r" % self

	def __str__(self):
	"""Fake-decode the byte string to unicode to support %
	formatting of unicode strings.
	"""
	return self.decode('ISO-8859-1')

	is_unicode = False


	char_from_escape_sequence = {
	r'\a' : u'\a',
	r'\b' : u'\b',
	r'\f' : u'\f',
	r'\n' : u'\n',
	r'\r' : u'\r',
	r'\t' : u'\t',
	r'\v' : u'\v',
	}.get

	_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))


	def _to_escape_sequence(s):
	if s in '\n\r\t':
	return repr(s)[1:-1]
	elif s == '"':
	return r'\"'
	elif s == '\\':
	return r'\\'
	else:
	# within a character sequence, oct passes much better than hex
	return ''.join(['\\%03o' % ord(c) for c in s])


	def _build_specials_replacer():
	subexps = []
	replacements = {}
	for special in _c_special:
	regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
	subexps.append(regexp)
	replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
	sub = re.compile(('(%s)' % '\|'.join(subexps)).encode('ASCII')).sub
	def replace_specials(m):
	return replacements[m.group(1)]
	def replace(s):
	return sub(replace_specials, s)
	return replace

	_replace_specials = _build_specials_replacer()


	def escape_char(c):
	if IS_PYTHON3:
	c = c.decode('ISO-8859-1')
	if c in '\n\r\t\\':
	return repr(c)[1:-1]
	elif c == "'":
	return "\\'"
	n = ord(c)
	if n < 32 or n > 127:
	# hex works well for characters
	return "\\x%02X" % n
	else:
	return c

	def escape_byte_string(s):
	"""Escape a byte string so that it can be written into C code.
	Note that this returns a Unicode string instead which, when
	encoded as ISO-8859-1, will result in the correct byte sequence
	being written.
	"""
	s = _replace_specials(s)
	try:
	return s.decode("ASCII") # trial decoding: plain ASCII => done
	except UnicodeDecodeError:
	pass
	if IS_PYTHON3:
	s_new = bytearray()
	append, extend = s_new.append, s_new.extend
	for b in s:
	if b >= 128:
	extend(('\\%3o' % b).encode('ASCII'))
	else:
	append(b)
	return s_new.decode('ISO-8859-1')
	else:
	l = []
	append = l.append
	for c in s:
	o = ord(c)
	if o >= 128:
	append('\\%3o' % o)
	else:
	append(c)
	return join_bytes(l).decode('ISO-8859-1')

	def split_string_literal(s, limit=2000):
	# MSVC can't handle long string literals.
	if len(s) < limit:
	return s
	else:
	start = 0
	chunks = []
	while start < len(s):
	end = start + limit
	if len(s) > end-4 and '\\' in s[end-4:end]:
	end -= 4 - s[end-4:end].find('\\') # just before the backslash
	while s[end-1] == '\\':
	end -= 1
	if end == start:
	# must have been a long line of backslashes
	end = start + limit - (limit % 2) - 4
	break
	chunks.append(s[start:end])
	start = end
	return '""'.join(chunks)

	def encode_pyunicode_string(s):
	"""Create Py_UNICODE[] representation of a given unicode string.
	"""
	s = map(ord, s) + [0]

	if sys.maxunicode >= 0x10000: # Wide build or Py3.3
	utf16, utf32 = [], s
	for code_point in s:
	if code_point >= 0x10000: # outside of BMP
	high, low = divmod(code_point - 0x10000, 1024)
	utf16.append(high + 0xD800)
	utf16.append(low + 0xDC00)
	else:
	utf16.append(code_point)
	else:
	utf16, utf32 = s, []
	for code_unit in s:
	if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
	high, low = utf32[-1], code_unit
	utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
	else:
	utf32.append(code_unit)

	if utf16 == utf32:
	utf16 = []
	return ",".join(map(unicode, utf16)), ",".join(map(unicode, utf32))