268 lines
9.0 KiB
Python
268 lines
9.0 KiB
Python
import java.lang.Character
|
|
try:
|
|
# import from jarjar-ed version
|
|
from org.python.icu.text import Normalizer
|
|
from org.python.icu.lang import UCharacter, UProperty
|
|
from org.python.icu.util import VersionInfo
|
|
from org.python.icu.lang.UCharacter import EastAsianWidth, DecompositionType
|
|
from org.python.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
|
|
except ImportError:
|
|
# development version of Jython, so use extlibs
|
|
from com.ibm.icu.text import Normalizer
|
|
from com.ibm.icu.lang import UCharacter, UProperty
|
|
from com.ibm.icu.util import VersionInfo
|
|
from com.ibm.icu.lang.UCharacter import EastAsianWidth, DecompositionType
|
|
from com.ibm.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
|
|
|
|
|
|
# Names exported by a star-import of this module.
__all__ = (
    "bidirectional",
    "category",
    "combining",
    "decimal",
    "decomposition",
    "digit",
    "east_asian_width",
    "lookup",
    "mirrored",
    "name",
    "normalize",
    "numeric",
    "unidata_version",
)
|
|
|
|
|
|
# Maps each normalization form name accepted by normalize() to the
# Normalizer attribute of the same name.
_forms = {
    form_name: getattr(Normalizer, form_name)
    for form_name in ("NFC", "NFKC", "NFD", "NFKD")
}
|
|
|
|
# Sentinel meaning "no default was supplied". None cannot play this role
# because None is itself a valid value for callers to pass as a default.
Nonesuch = object()
|
|
|
|
|
|
def _validate_unichr(unichr):
    """Check that ``unichr`` is a unicode string of length exactly one.

    Raises:
        TypeError: if ``unichr`` is not a ``unicode`` instance, or if its
            length is anything other than 1.
    """
    if not isinstance(unichr, unicode):
        type_name = type(unichr).__name__
        raise TypeError("must be unicode, not {}".format(type_name))
    if len(unichr) != 1:
        raise TypeError("need a single Unicode character as parameter")
|
|
|
|
|
|
def _get_codepoint(unichr):
    """Return the integer code point of the single character ``unichr``.

    The argument is first checked by _validate_unichr, so a TypeError is
    raised for anything that is not a one-character unicode string.
    """
    _validate_unichr(unichr)
    codepoint = ord(unichr)
    return codepoint
|
|
|
|
|
|
def name(unichr, default=Nonesuch):
    """Return the name ``UCharacter.getName`` reports for ``unichr``.

    If ICU has no name for the character (getName returns None), ``default``
    is returned when one was supplied; otherwise ValueError is raised.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
        ValueError: if the character has no name and no default was given.
    """
    found = UCharacter.getName(_get_codepoint(unichr))
    if found is not None:
        return found
    # Compare against the Nonesuch sentinel, not None, so that an explicit
    # default=None is honoured and returned.
    if default is Nonesuch:
        raise ValueError("no such name")
    return default
|
|
|
|
|
|
def lookup(name):
    """Return the character whose Unicode name is ``name``.

    The name is resolved with ``UCharacter.getCharFromName``; a result of -1
    is treated as "no such character".

    Raises:
        KeyError: if no character with the given name is found.
    """
    codepoint = UCharacter.getCharFromName(name)
    if codepoint == -1:
        # The closing quote after the name was missing from this message,
        # leaving it unbalanced ("... 'NAME").
        raise KeyError("undefined character name '{}'".format(name))
    return unichr(codepoint)
|
|
|
|
|
|
def digit(unichr, default=Nonesuch):
    """Return the digit value ``UCharacter.digit`` reports for ``unichr``.

    When ICU reports -1 (no digit value), ``default`` is returned if one was
    supplied; otherwise ValueError is raised.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
        ValueError: if there is no digit value and no default was given.
    """
    value = UCharacter.digit(_get_codepoint(unichr))
    if value != -1:
        return value
    # Nonesuch (not None) marks "no default", so default=None is returned.
    if default is Nonesuch:
        raise ValueError("not a digit")
    return default
|
|
|
|
|
|
def decimal(unichr, default=Nonesuch):
    """Return the decimal value of ``unichr`` as an integer in 0..9.

    The value comes from ``UCharacter.getNumericValue``. Any result outside
    the range 0..9 is treated as "not a decimal": ``default`` is returned if
    one was supplied, otherwise ValueError is raised.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
        ValueError: if there is no decimal value and no default was given.
    """
    # NOTE(review): getNumericValue appears to also report 0..9 for
    # characters that are not decimal digits (e.g. superscript digits),
    # which CPython's decimal() rejects -- confirm against the ICU4J docs.
    value = UCharacter.getNumericValue(_get_codepoint(unichr))
    if 0 <= value <= 9:
        return value
    # Nonesuch (not None) marks "no default", so default=None is returned.
    if default is Nonesuch:
        raise ValueError("not a decimal")
    return default
|
|
|
|
|
|
def numeric(unichr, default=Nonesuch):
    """Return the numeric value of ``unichr``.

    The value comes from ``UCharacter.getUnicodeNumericValue``. When ICU
    answers with ``UCharacter.NO_NUMERIC_VALUE``, ``default`` is returned if
    one was supplied; otherwise ValueError is raised.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
        ValueError: if there is no numeric value and no default was given.
    """
    value = UCharacter.getUnicodeNumericValue(_get_codepoint(unichr))
    if value != UCharacter.NO_NUMERIC_VALUE:
        return value
    # Nonesuch (not None) marks "no default", so default=None is returned.
    if default is Nonesuch:
        raise ValueError("not a numeric")
    return default
|
|
|
|
|
|
# Maps ICU DecompositionType constants to the tag that decomposition()
# places between angle brackets; None means "no tag".
_decomp = {
    DecompositionType.NONE: None,
    DecompositionType.CANONICAL: "canonical",
    DecompositionType.COMPAT: "compat",
    # positional forms
    DecompositionType.INITIAL: "initial",
    DecompositionType.MEDIAL: "medial",
    DecompositionType.FINAL: "final",
    DecompositionType.ISOLATED: "isolated",
    # width and size variants
    DecompositionType.WIDE: "wide",
    DecompositionType.NARROW: "narrow",
    DecompositionType.SMALL: "small",
    # remaining presentation variants
    DecompositionType.CIRCLE: "circle",
    DecompositionType.FONT: "font",
    DecompositionType.FRACTION: "fraction",
    DecompositionType.NOBREAK: "nobreak",
    DecompositionType.SQUARE: "square",
    DecompositionType.SUB: "sub",
    DecompositionType.SUPER: "super",
    DecompositionType.VERTICAL: "vertical",
}
|
|
|
|
def _get_decomp_type(unichr):
    """Return the decomposition tag for ``unichr``, or None if it has none.

    The tag is looked up in _decomp from ICU's DECOMPOSITION_TYPE property,
    except for U+2044 FRACTION SLASH, which always yields "fraction".
    """
    if unichr != u"\u2044":
        icu_decomp_type = UCharacter.getIntPropertyValue(
            ord(unichr), UProperty.DECOMPOSITION_TYPE)
        return _decomp[icu_decomp_type]
    # FRACTION SLASH is special-cased for CPython compatibility, even though
    # it is reported as not being combining; see
    # http://www.fileformat.info/info/unicode/char/2044/index.htm
    return "fraction"
|
|
|
|
def decomposition(unichr):
    """Return the decomposition mapping of ``unichr`` as a string.

    The character is decomposed with ``Normalizer.decompose(unichr, True)``
    and the resulting code points are rendered as space-separated,
    zero-padded uppercase hex. The result is one of:

    * ``"<tag> HEX ..."`` when a decomposition tag is found,
    * ``""`` when the decomposition is a single character with no tag,
    * ``"HEX ..."`` otherwise.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
    """
    _validate_unichr(unichr)
    expanded = Normalizer.decompose(unichr, True)

    if len(expanded) == 1:
        # Single-character result: the tag comes from the input character.
        tag = _get_decomp_type(unichr)
    else:
        # Use the first non-None tag among the decomposed characters. The
        # generator is lazy, so lookups stop at the first hit.
        candidate_tags = (_get_decomp_type(ch) for ch in expanded)
        tag = next((t for t in candidate_tags if t is not None), None)

    hexed = " ".join("{0:04X}".format(ord(ch)) for ch in expanded)
    if tag:
        return "<{}> {}".format(tag, hexed)
    if len(expanded) == 1:
        return ""
    return hexed
|
|
|
|
|
|
# Mapping the ICU4J enumerations for category, bidirectional class, and
# east_asian_width to the underlying property values that Python uses
# (as found in UnicodeData.txt) required a manual cross-reference between
# the following two files:
#
# http://icu-project.org/apiref/icu4j/constant-values.html
# http://www.unicode.org/Public/6.3.0/ucd/PropertyValueAliases.txt
|
|
|
|
# Maps ICU ECharacterCategory constants to the two-letter general category
# abbreviations returned by category().
_cat = {
    # L* -- letters
    ECharacterCategory.UPPERCASE_LETTER: "Lu",
    ECharacterCategory.LOWERCASE_LETTER: "Ll",
    ECharacterCategory.TITLECASE_LETTER: "Lt",
    ECharacterCategory.MODIFIER_LETTER: "Lm",
    ECharacterCategory.OTHER_LETTER: "Lo",
    # M* -- marks
    ECharacterCategory.NON_SPACING_MARK: "Mn",
    ECharacterCategory.COMBINING_SPACING_MARK: "Mc",
    ECharacterCategory.ENCLOSING_MARK: "Me",
    # N* -- numbers
    ECharacterCategory.DECIMAL_DIGIT_NUMBER: "Nd",
    ECharacterCategory.LETTER_NUMBER: "Nl",
    ECharacterCategory.OTHER_NUMBER: "No",
    # P* -- punctuation
    ECharacterCategory.CONNECTOR_PUNCTUATION: "Pc",
    ECharacterCategory.DASH_PUNCTUATION: "Pd",
    ECharacterCategory.START_PUNCTUATION: "Ps",
    ECharacterCategory.END_PUNCTUATION: "Pe",
    ECharacterCategory.INITIAL_PUNCTUATION: "Pi",
    ECharacterCategory.FINAL_PUNCTUATION: "Pf",
    ECharacterCategory.OTHER_PUNCTUATION: "Po",
    # S* -- symbols
    ECharacterCategory.MATH_SYMBOL: "Sm",
    ECharacterCategory.CURRENCY_SYMBOL: "Sc",
    ECharacterCategory.MODIFIER_SYMBOL: "Sk",
    ECharacterCategory.OTHER_SYMBOL: "So",
    # Z* -- separators
    ECharacterCategory.SPACE_SEPARATOR: "Zs",
    ECharacterCategory.LINE_SEPARATOR: "Zl",
    ECharacterCategory.PARAGRAPH_SEPARATOR: "Zp",
    # C* -- other
    ECharacterCategory.CONTROL: "Cc",
    ECharacterCategory.FORMAT: "Cf",
    ECharacterCategory.SURROGATE: "Cs",
    ECharacterCategory.PRIVATE_USE: "Co",
    # per http://icu-project.org/apiref/icu4j/com/ibm/icu/lang/UCharacterEnums.ECharacterCategory.html#GENERAL_OTHER_TYPES
    # - no characters in [UnicodeData.txt] have this property
    # This entry used to map to "Cn Not Assigned", which is not a valid
    # category abbreviation. It now uses "Cn", so the table is correct
    # whether or not this constant shares its value with UNASSIGNED
    # (presumably it does -- confirm against the ICU4J constant values).
    ECharacterCategory.GENERAL_OTHER_TYPES: "Cn",
    ECharacterCategory.UNASSIGNED: "Cn",
}
|
|
|
|
def category(unichr):
    """Return the general category abbreviation (e.g. "Lu") for ``unichr``.

    The ICU type from ``UCharacter.getType`` is translated through _cat.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
    """
    icu_type = UCharacter.getType(_get_codepoint(unichr))
    return _cat[icu_type]
|
|
|
|
|
|
# Maps ICU ECharacterDirection constants to the bidirectional class
# abbreviations returned by bidirectional().
_dir = {
    ECharacterDirection.LEFT_TO_RIGHT: "L",
    ECharacterDirection.RIGHT_TO_LEFT: "R",
    ECharacterDirection.RIGHT_TO_LEFT_ARABIC: "AL",
    # numbers and their separators/terminators
    ECharacterDirection.EUROPEAN_NUMBER: "EN",
    ECharacterDirection.EUROPEAN_NUMBER_SEPARATOR: "ES",
    ECharacterDirection.EUROPEAN_NUMBER_TERMINATOR: "ET",
    # Was "An"; the Bidi_Class alias is the all-uppercase "AN", like every
    # other value in this table.
    ECharacterDirection.ARABIC_NUMBER: "AN",
    ECharacterDirection.COMMON_NUMBER_SEPARATOR: "CS",
    ECharacterDirection.DIR_NON_SPACING_MARK: "NSM",
    ECharacterDirection.BOUNDARY_NEUTRAL: "BN",
    # separators, whitespace and other neutrals
    ECharacterDirection.BLOCK_SEPARATOR: "B",
    ECharacterDirection.SEGMENT_SEPARATOR: "S",
    ECharacterDirection.WHITE_SPACE_NEUTRAL: "WS",
    ECharacterDirection.OTHER_NEUTRAL: "ON",
    # embeddings, overrides and their terminator
    ECharacterDirection.LEFT_TO_RIGHT_EMBEDDING: "LRE",
    ECharacterDirection.LEFT_TO_RIGHT_OVERRIDE: "LRO",
    ECharacterDirection.RIGHT_TO_LEFT_EMBEDDING: "RLE",
    ECharacterDirection.RIGHT_TO_LEFT_OVERRIDE: "RLO",
    ECharacterDirection.POP_DIRECTIONAL_FORMAT: "PDF",
    # isolates and their terminator
    ECharacterDirection.LEFT_TO_RIGHT_ISOLATE: "LRI",
    ECharacterDirection.RIGHT_TO_LEFT_ISOLATE: "RLI",
    ECharacterDirection.FIRST_STRONG_ISOLATE: "FSI",
    ECharacterDirection.POP_DIRECTIONAL_ISOLATE: "PDI",
}
|
|
|
|
def bidirectional(unichr):
    """Return the bidirectional class abbreviation for ``unichr``.

    The ICU direction from ``UCharacter.getDirection`` is translated
    through _dir.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
    """
    icu_direction = UCharacter.getDirection(_get_codepoint(unichr))
    return _dir[icu_direction]
|
|
|
|
|
|
def combining(unichr):
    """Return the combining class ICU reports for ``unichr``.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
    """
    codepoint = _get_codepoint(unichr)
    return UCharacter.getCombiningClass(codepoint)
|
|
|
|
|
|
def mirrored(unichr):
    """Return the result of ``UCharacter.isMirrored`` for ``unichr``.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
    """
    codepoint = _get_codepoint(unichr)
    return UCharacter.isMirrored(codepoint)
|
|
|
|
|
|
# Maps ICU EastAsianWidth constants to the abbreviations returned by
# east_asian_width(); see http://www.unicode.org/reports/tr11/
_eaw = {
    EastAsianWidth.NEUTRAL: "N",
    EastAsianWidth.AMBIGUOUS: "A",
    EastAsianWidth.HALFWIDTH: "H",
    EastAsianWidth.FULLWIDTH: "F",
    EastAsianWidth.NARROW: "Na",
    EastAsianWidth.WIDE: "W",
    EastAsianWidth.COUNT: "?",  # apparently not used, see the TR above
}
|
|
|
|
def east_asian_width(unichr):
    """Return the East Asian width abbreviation for ``unichr``.

    ICU's EAST_ASIAN_WIDTH property value is translated through _eaw.

    Raises:
        TypeError: if ``unichr`` is not a single unicode character.
    """
    icu_width = UCharacter.getIntPropertyValue(
        _get_codepoint(unichr), UProperty.EAST_ASIAN_WIDTH)
    return _eaw[icu_width]
|
|
|
|
|
|
def normalize(form, unistr):
    """
    Return normal form ``form`` of the Unicode string ``unistr``.

    ``form`` must be one of 'NFC', 'NFKC', 'NFD' or 'NFKD'; any other
    value raises ValueError.
    """
    if form not in _forms:
        raise ValueError('invalid normalization form')
    return Normalizer.normalize(unistr, _forms[form])
|
|
|
|
|
|
def get_icu_version():
    """Return the highest ``UNICODE_*`` version on VersionInfo as a string.

    Every attribute of ``VersionInfo`` whose name starts with "UNICODE_" is
    read as a (major, minor, milli) triple; the largest triple is returned
    in dotted form, e.g. "major.minor.milli".
    """
    unicode_versions = [
        getattr(VersionInfo, attr)
        for attr in VersionInfo.__dict__.iterkeys()
        if attr.startswith("UNICODE_")
    ]
    newest = max(
        (v.getMajor(), v.getMinor(), v.getMilli()) for v in unicode_versions)
    return ".".join(str(part) for part in newest)
|
|
|
|
|
|
# Computed once at import time: the highest UNICODE_* version constant that
# get_icu_version() finds on ICU's VersionInfo, as a dotted string.
unidata_version = get_icu_version()
|