microproduct/atmosphericDelay/ISCEApp/site-packages/enchant/tokenize/en.py

# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.tokenize.en: Tokenizer for the English language
This module implements a PyEnchant text tokenizer for the English
language, based on very simple rules.
"""
import unicodedata
import enchant.tokenize


class tokenize(enchant.tokenize.tokenize):  # noqa: N801
    """Iterator splitting text into words, reporting position.

    This iterator takes a text string as input, and yields tuples
    representing each distinct word found in the text.  The tuples
    take the form:

        (<word>, <pos>)

    Where <word> is the word string found and <pos> is the position
    of the start of the word within the text.

    The optional argument <valid_chars> may be used to specify a
    list of additional characters that can form part of a word.
    By default, this list contains only the apostrophe (').  Note that
    these characters cannot appear at the start or end of a word.
    """

    _DOC_ERRORS = ["pos", "pos"]
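
    # Expected behaviour, shown doctest-style (offsets traced by hand
    # through next() below; not an upstream doctest):
    #
    #     >>> list(tokenize("this is a test"))
    #     [('this', 0), ('is', 5), ('a', 8), ('test', 10)]
    #     >>> list(tokenize("don't stop"))
    #     [("don't", 0), ('stop', 6)]
    #     >>> list(tokenize("up-to-date", valid_chars=("-",)))
    #     [('up-to-date', 0)]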

    def __init__(self, text, valid_chars=None):
        self._valid_chars = valid_chars
        self._text = text
        self._offset = 0
        # Select the proper implementation of self._consume_alpha.
        # 'text' isn't necessarily a string (it could be e.g. a mutable array)
        # so we can't use isinstance(text, str) to detect unicode.
        # Instead we typetest the first character of the text.
        # If there are no characters then it doesn't matter which
        # implementation we use, since it won't be called anyway.
        try:
            char1 = text[0]
        except IndexError:
            self._initialize_for_binary()
        else:
            if isinstance(char1, str):
                self._initialize_for_unicode()
            else:
                self._initialize_for_binary()
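
    # Note on the dispatch above: a unicode (str) first character selects
    # _consume_alpha_u; anything else, e.g. the one-byte strings yielded by
    # indexing a Python 2 bytestring, selects _consume_alpha_b.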

    def _initialize_for_binary(self):
        self._consume_alpha = self._consume_alpha_b
        if self._valid_chars is None:
            self._valid_chars = ("'",)

    def _initialize_for_unicode(self):
        self._consume_alpha = self._consume_alpha_u
        if self._valid_chars is None:
            # XXX TODO: this doesn't seem to work correctly with the
            # MySpell provider, disabling for now.
            # Allow unicode typographic apostrophe:
            # self._valid_chars = (u"'", u"\u2019")
            self._valid_chars = ("'",)

    def _consume_alpha_b(self, text, offset):
        """Consume an alphabetic character from the given bytestring.

        Given a bytestring and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Non-ASCII bytes are interpreted as utf-8 and can
        result in multiple characters being consumed.
        """
        assert offset < len(text)
        if text[offset].isalpha():
            return 1
        elif text[offset] >= "\x80":
            return self._consume_alpha_utf8(text, offset)
        return 0
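
    # Worked example (Python 2 semantics, where indexing a bytestring yields
    # a one-byte str): "a" consumes 1, " " consumes 0, and a utf-8 lead byte
    # such as "\xc3" defers to _consume_alpha_utf8 to consume the whole
    # multi-byte sequence.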

    def _consume_alpha_utf8(self, text, offset):
        """Consume a sequence of utf8 bytes forming an alphabetic character."""
        incr = 2
        u = ""
        while not u and incr <= 4:
            try:
                try:
                    # In the common case this will be a string
                    u = text[offset : offset + incr].decode("utf8")
                except AttributeError:
                    # Looks like it was e.g. a mutable char array.
                    try:
                        s = text[offset : offset + incr].tostring()
                    except AttributeError:
                        s = "".join([c for c in text[offset : offset + incr]])
                    u = s.decode("utf8")
            except UnicodeDecodeError:
                incr += 1
        if not u:
            return 0
        if u.isalpha():
            return incr
        if unicodedata.category(u)[0] == "M":
            return incr
        return 0
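
    # Worked example: for the utf-8 encoding of "é" (bytes "\xc3\xa9"), the
    # initial two-byte slice decodes cleanly and "é".isalpha() is true, so 2
    # is returned.  A three-byte letter such as "\u3042" fails the two-byte
    # decode with UnicodeDecodeError and succeeds at incr == 3.  Combining
    # marks (category "M...") are also consumed, so accents attach to the
    # preceding letter rather than splitting the word.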

    def _consume_alpha_u(self, text, offset):
        """Consume an alphabetic character from the given unicode string.

        Given a unicode string and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Trailing combining characters are consumed as a
        single letter.
        """
        assert offset < len(text)
        incr = 0
        if text[offset].isalpha():
            incr = 1
            while offset + incr < len(text):
                if unicodedata.category(text[offset + incr])[0] != "M":
                    break
                incr += 1
        return incr
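
    # Worked example: for text = "e\u0301x" ("e" followed by COMBINING ACUTE
    # ACCENT), _consume_alpha_u(text, 0) returns 2 because the category-"Mn"
    # mark is absorbed into the letter; _consume_alpha_u(text, 2) returns 1.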

    def next(self):
        text = self._text
        offset = self._offset
        while offset < len(text):
            # Find start of next word (must be alpha)
            while offset < len(text):
                incr = self._consume_alpha(text, offset)
                if incr:
                    break
                offset += 1
            cur_pos = offset
            # Find end of word, allowing valid_chars
            while offset < len(text):
                incr = self._consume_alpha(text, offset)
                if not incr:
                    if text[offset] in self._valid_chars:
                        incr = 1
                    else:
                        break
                offset += incr
            # Return if word isn't empty
            if cur_pos != offset:
                # Make sure word doesn't end with a valid_char
                while text[offset - 1] in self._valid_chars:
                    offset = offset - 1
                self._offset = offset
                return (text[cur_pos:offset], cur_pos)
        self._offset = offset
        raise StopIteration()
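

# Illustrative usage sketch (not from the upstream module): running this
# file directly prints each word together with its starting offset, e.g.
# "0 this", "5 is", ...
if __name__ == "__main__":
    for word, pos in tokenize("this is a test of the tokenizer's output"):
        print(pos, word)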