# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two.  You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers.  If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so.  If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.tokenize.en:  Tokenizer for the English language

This module implements a PyEnchant text tokenizer for the English
language, based on very simple rules.

"""

import unicodedata

import enchant.tokenize


class tokenize(enchant.tokenize.tokenize):  # noqa: N801
    """Iterator splitting text into words, reporting position.

    This iterator takes a text string as input, and yields tuples
    representing each distinct word found in the text.  The tuples
    take the form:

        (<word>, <pos>)

    Where <word> is the word string found and <pos> is the position
    of the start of the word within the text.

    The optional argument <valid_chars> may be used to specify a list
    of additional characters that can form part of a word.  By default,
    this list contains only the apostrophe (').  Note that these
    characters cannot appear at the start or end of a word.
    """

    _DOC_ERRORS = ["pos", "pos"]

    def __init__(self, text, valid_chars=None):
        self._valid_chars = valid_chars
        self._text = text
        self._offset = 0
        # Select the proper implementation of self._consume_alpha.
        # 'text' isn't necessarily a string (it could be e.g. a mutable
        # array) so we can't use isinstance(text, str) to detect unicode.
        # Instead we typetest the first character of the text.
        # If there are no characters then it doesn't matter which
        # implementation we use, since it won't be called anyway.
        try:
            char1 = text[0]
        except IndexError:
            self._initialize_for_binary()
        else:
            if isinstance(char1, str):
                self._initialize_for_unicode()
            else:
                self._initialize_for_binary()

    def _initialize_for_binary(self):
        self._consume_alpha = self._consume_alpha_b
        if self._valid_chars is None:
            self._valid_chars = ("'",)

    def _initialize_for_unicode(self):
        self._consume_alpha = self._consume_alpha_u
        if self._valid_chars is None:
            # XXX TODO: this doesn't seem to work correctly with the
            # MySpell provider, disabling for now.
            # Allow the unicode typographic apostrophe:
            # self._valid_chars = ("'", "\u2019")
            self._valid_chars = ("'",)

    def _consume_alpha_b(self, text, offset):
        """Consume an alphabetic character from the given bytestring.

        Given a bytestring and the current offset, this method returns
        the number of characters occupied by the next alphabetic
        character in the string.
        Non-ASCII bytes are interpreted as utf-8 and can result in
        multiple characters being consumed.
        """
        assert offset < len(text)
        if text[offset].isalpha():
            return 1
        elif text[offset] >= "\x80":
            # Bytes above 0x7f may begin a multi-byte utf-8 sequence.
            return self._consume_alpha_utf8(text, offset)
        return 0

    def _consume_alpha_utf8(self, text, offset):
        """Consume a sequence of utf8 bytes forming an alphabetic character."""
        incr = 2
        u = ""
        while not u and incr <= 4:
            try:
                try:
                    # In the common case this will be a string
                    u = text[offset : offset + incr].decode("utf8")
                except AttributeError:
                    # Looks like it was e.g. a mutable char array.
                    try:
                        s = text[offset : offset + incr].tostring()
                    except AttributeError:
                        s = "".join([c for c in text[offset : offset + incr]])
                    u = s.decode("utf8")
            except UnicodeDecodeError:
                incr += 1
        if not u:
            return 0
        if u.isalpha():
            return incr
        # Combining marks (unicode category "M") also count as part of a letter.
        if unicodedata.category(u)[0] == "M":
            return incr
        return 0

    def _consume_alpha_u(self, text, offset):
        """Consume an alphabetic character from the given unicode string.

        Given a unicode string and the current offset, this method
        returns the number of characters occupied by the next alphabetic
        character in the string.  Trailing combining characters are
        consumed as a single letter.
        """
        assert offset < len(text)
        incr = 0
        if text[offset].isalpha():
            incr = 1
            while offset + incr < len(text):
                if unicodedata.category(text[offset + incr])[0] != "M":
                    break
                incr += 1
        return incr

    def next(self):
        text = self._text
        offset = self._offset
        while offset < len(text):
            # Find start of next word (must be alpha)
            while offset < len(text):
                incr = self._consume_alpha(text, offset)
                if incr:
                    break
                offset += 1
            cur_pos = offset
            # Find end of word, allowing valid_chars within it
            while offset < len(text):
                incr = self._consume_alpha(text, offset)
                if not incr:
                    if text[offset] in self._valid_chars:
                        incr = 1
                    else:
                        break
                offset += incr
            # Return if word isn't empty
            if cur_pos != offset:
                # Make sure the word doesn't end with a valid_char
                while text[offset - 1] in self._valid_chars:
                    offset = offset - 1
                self._offset = offset
                return (text[cur_pos:offset], cur_pos)
        self._offset = offset
        raise StopIteration()
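

# Minimal usage sketch (not part of the original module): running this file
# directly prints each (word, offset) tuple yielded by the tokenizer, relying
# on the iterator protocol provided by the enchant.tokenize base class.  The
# sample sentence below is an illustrative assumption, not a fixture from the
# test suite; the apostrophe inside "can't" is kept as part of the word
# because "'" is in the default valid_chars.
if __name__ == "__main__":
    sample = "They can't tokenize this, can they?"
    for word, pos in tokenize(sample):
        print(pos, word)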