Source code for nlpretext.basic.preprocess

# coding=utf-8
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import re
import unicodedata
from ftfy import fix_text as _fix_text
from nlpretext._config import constants
from nlpretext.token.tokenizer import tokenize
from nlpretext._utils.phone_number import extract_phone_numbers as _extract_phone_numbers
from nlpretext._utils.stopwords import get_stopwords


[docs]def normalize_whitespace(text) -> str: """ Given ``text`` str, replace one or more spacings with a single space, and one or more linebreaks with a single newline. Also strip leading/trailing whitespace. eg. " foo bar " -> "foo bar" Parameters ---------- text : string Returns ------- string """ text = constants.NONBREAKING_SPACE_REGEX.sub( " ", constants.LINEBREAK_REGEX.sub(r"\n", text) ).strip() return text
[docs]def lower_text(text: str): """ Given ``text`` str, transform it into lowercase Parameters ---------- text : string Returns ------- string """ return text.lower()
[docs]def remove_stopwords(text: str, lang: str, custom_stopwords: list = None) -> str: """ Given ``text`` str, remove classic stopwords for a given language and custom stopwords given as a list. Parameters ---------- text : string lang : string custom_stopwords : list of strings Returns ------- string """ stopwords = get_stopwords(lang) if custom_stopwords: stopwords += custom_stopwords if lang in ["fr", "en"]: lang_module = { "fr" : "fr_spacy", "en" : "en_spacy" }[lang] return ' '.join( [x for x in tokenize(text, lang_module) if x not in stopwords]) return ' '.join([x for x in text.split() if x not in stopwords])
[docs]def remove_eol_characters(text) -> str: """ Remove end of line (\n) char. Parameters ---------- text : str Returns ------- str """ text = text.replace("\n", " ") return text
[docs]def fix_bad_unicode(text, normalization: str = "NFC") -> str: """ Fix unicode text that's "broken" using `ftfy <http://ftfy.readthedocs.org/>`_; this includes mojibake, HTML entities and other code cruft, and non-standard forms for display purposes. Parameters ---------- text : string normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}): if 'NFC', combines characters and diacritics written using separate code points, e.g. converting "e" plus an acute accent modifier into "é"; unicode can be converted to NFC form without any change in its meaning! if 'NFKC', additional normalizations are applied that can change the meanings of characters, e.g. ellipsis characters will be replaced with three periods Returns ------- string """ text = _fix_text(text, normalization=normalization) return text
[docs]def unpack_english_contractions(text) -> str: """ Replace *English* contractions in ``text`` str with their unshortened forms. N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive), so are left as-is. eg. "You're fired. She's nice." -> "You are fired. She's nice." Parameters ---------- text : string Returns ------- string """ # standard text = constants.CONTRACTION_NT_NOT.sub( r"\1\2 not", text, ) text = constants.CONTRACTION_LL_WILL.sub( r"\1\2 will", text, ) text = constants.CONTRACTION_RE_ARE.sub(r"\1\2 are", text) text = constants.CONTRACTION_VE_HAVE.sub( r"\1\2 have", text, ) text = constants.CONTRACTION_CANT_CANNOT.sub(r"\1\2n not", text) text = constants.CONTRACTION_M_AM.sub(r"\1\2 am", text) text = constants.CONTRACTION_LET_LETUS.sub(r"\1\2 us", text) text = constants.CONTRACTION_WONT_WILLNOT.sub(r"\1\2ill not", text) text = constants.CONTRACTION_SHANT_SHALLNOT.sub(r"\1\2hall not", text) text = constants.CONTRACTION_YALL_YOUALL.sub(r"\1\2ou all", text) return text
[docs]def replace_urls(text, replace_with: str = "*URL*") -> str: """ Replace all URLs in ``text`` str with ``replace_with`` str. Parameters ---------- text : string replace_with : string the string you want the URL to be replaced with. Returns ------- string """ text = constants.URL_REGEX.sub( replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text) ) return text
[docs]def replace_emails(text, replace_with="*EMAIL*") -> str: """ Replace all emails in ``text`` str with ``replace_with`` str Parameters ---------- text : string replace_with : string the string you want the email address to be replaced with. Returns ------- string """ text = constants.EMAIL_REGEX.sub(replace_with, text) return text
[docs]def replace_phone_numbers(text, country_to_detect: list, replace_with: str = "*PHONE*", method: str = "regex") -> str: """ Replace all phone numbers in ``text`` str with ``replace_with`` str Parameters ---------- text : string replace_with : string the string you want the phone number to be replaced with. method : ['regex','detection'] regex is faster but will omit a lot of numbers, while detection will catch every numbers, but takes a while. country_to_detect : list If a list of country code is specified, will catch every number formatted. Only when method = 'detection'. Returns ------- string """ if method == 'regex': text = constants.PHONE_REGEX.sub(replace_with, text) elif method == 'detection': found_nums = _extract_phone_numbers(text, countrylist=country_to_detect) # order by lenght to avoid truncated numbers to be removed first. found_nums.sort(key=len, reverse=True) for phone_number in found_nums: text = text.replace(phone_number, replace_with) else: raise ValueError('Please input a valid method between "regex" or \ "detection"') return text
[docs]def replace_numbers(text, replace_with="*NUMBER*") -> str: """ Replace all numbers in ``text`` str with ``replace_with`` str. Parameters ---------- text : string replace_with : string the string you want the number to be replaced with. Returns ------- string """ text = constants.NUMBERS_REGEX.sub(replace_with, text) return text
[docs]def replace_currency_symbols(text, replace_with=None) -> str: """ Replace all currency symbols in ``text`` str with string specified by ``replace_with`` str. Parameters ---------- text : str raw text replace_with : None or string if None (default), replace symbols with their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP'); otherwise, pass in a string with which to replace all symbols (e.g. "*CURRENCY*") Returns ------- string """ if replace_with is None: for k, v in constants.CURRENCIES.items(): text = text.replace(k, v) else: text = constants.CURRENCY_REGEX.sub(replace_with, text) return text
[docs]def remove_punct(text, marks=None) -> str: """ Remove punctuation from ``text`` by replacing all instances of ``marks`` with whitespace. Parameters ---------- text : str raw text marks : str or None If specified, remove only the characters in this string, e.g. ``marks=',;:'`` removes commas, semi-colons, and colons. Otherwise, all punctuation marks are removed. Returns ------- string Note ------- When ``marks=None``, Python's built-in :meth:`str.translate()` is used to remove punctuation; otherwise, a regular expression is used instead. The former's performance is about 5-10x faster. """ if marks: text = re.sub("[{}]+".format(re.escape(marks)), " ", text, flags=re.UNICODE) else: text = text.translate(constants.PUNCT_TRANSLATE_UNICODE) return text
[docs]def remove_accents(text, method: str = "unicode") -> str: """ Remove accents from any accented unicode characters in ``text`` str, either by transforming them into ascii equivalents or removing them entirely. Parameters ---------- text : str raw text method : ({'unicode', 'ascii'}) if 'unicode', remove accented char for any unicode symbol with a direct ASCII equivalent; if 'ascii', remove accented char for any unicode symbol NB: the 'ascii' method is notably faster than 'unicode', but less good Returns ------- string Raises ------- ValueError if ``method`` is not in {'unicode', 'ascii'} """ if method == "unicode": text = "".join( c for c in unicodedata.normalize("NFKD", text) if not unicodedata.combining(c) ) elif method == "ascii": text = ( unicodedata.normalize("NFKD", text) .encode("ascii", errors="ignore") .decode("ascii") ) else: msg = '`method` must be either "unicode" and "ascii", not {}' \ .format(method) raise ValueError(msg) return text
[docs]def remove_multiple_spaces_and_strip_text(text) -> str: """ Remove multiple spaces, strip text, and remove '-', '*' characters. Parameters ---------- text : str the text to be processed Returns ------- string the text with removed multiple spaces and strip text """ regex_remove_multiple_spaces_list = ["\\t", "[\\s\\-\\*]{2,}"] for regex_remove_multiple_spaces in regex_remove_multiple_spaces_list: text = re.sub(regex_remove_multiple_spaces, " ", text) text = text.strip() return text
[docs]def filter_non_latin_characters(text) -> str: """ Function that filters non latin characters of a text Parameters ---------- text : string Returns ------- string """ text = constants.LATIN_CHARACTERS_RE.sub(' ', text) text = normalize_whitespace(text) return text