# Source code for nlpretext.social.preprocess

# coding=utf-8
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

from __future__ import absolute_import, division, print_function, unicode_literals

import emoji as _emoji
from nlpretext._config import constants
from nlpretext.basic.preprocess import normalize_whitespace


def remove_mentions(text) -> str:
    """
    Strip mentions (words preceded with a '@') from a text.

    Every match of ``constants.AT_PATTERN`` is replaced by an empty string,
    then the result is passed through ``normalize_whitespace``.

    Parameters
    ----------
    text : str
        text to clean

    Returns
    -------
    str
        the text without mentions, with its whitespace normalized
    """
    without_mentions = constants.AT_PATTERN.sub('', text)
    return normalize_whitespace(without_mentions)


def extract_mentions(text) -> list:
    """
    Function that extracts words preceded with a '@'
    eg. "I take care of my skin with @thisproduct" --> ["@thisproduct"]

    Parameters
    ----------
    text : str
        text to search for mentions

    Returns
    -------
    list
        all matches of ``constants.AT_PATTERN`` found in the text, in order
        of appearance (empty list when there is no match)
    """
    # findall returns a list, not a string: the return documentation above
    # is aligned with the ``-> list`` annotation.
    return constants.AT_PATTERN.findall(text)


def remove_html_tags(text) -> str:
    """
    Strip HTML tags (words between < and >) from a text.

    Every match of ``constants.HTML_TAG_PATTERN`` is replaced by an empty
    string, then the result is passed through ``normalize_whitespace``.

    Parameters
    ----------
    text : str
        text to clean

    Returns
    -------
    str
        the text without HTML tags, with its whitespace normalized
    """
    without_tags = constants.HTML_TAG_PATTERN.sub('', text)
    return normalize_whitespace(without_tags)


def remove_emoji(text) -> str:
    """
    Strip emoji from a text by deleting every match of
    ``constants.EMOJI_PATTERN``.

    Emoji are those defined in the unicode convention:
    http://www.unicode.org/emoji/charts/full-emoji-list.html

    Unlike the other ``remove_*`` helpers of this module, the whitespace of
    the result is left untouched.

    Parameters
    ----------
    text : str
        text to clean

    Returns
    -------
    str
        the text without emoji
    """
    return constants.EMOJI_PATTERN.sub("", text)


def convert_emoji_to_text(text, code_delimiters=(':', ':')) -> str:
    """
    Replace each emoji of a text with its CLDR Short Name, according to the
    unicode convention
    http://www.unicode.org/emoji/charts/full-emoji-list.html
    eg. 😀 --> :grinning_face:

    The conversion is delegated to ``emoji.demojize``.

    Parameters
    ----------
    text : str
        text containing the emoji to convert
    code_delimiters : tuple
        symbols placed around the emoji code
        eg: (':',':') --> :grinning_face:

    Returns
    -------
    str
        the text with emoji replaced by their delimited names
    """
    demojized = _emoji.demojize(text, delimiters=code_delimiters)
    return demojized


def extract_emojis(text) -> list:
    """
    Collect the emojis of a text and translate each of them into words
    eg. "I take care of my skin 😀 :(" --> [":grinning_face:"]

    Parameters
    ----------
    text : str
        text to search for emojis

    Returns
    -------
    list
        one entry per match of ``constants.EMOJI_PATTERN``, each converted
        by ``convert_emoji_to_text`` with its default delimiters
    """
    found = constants.EMOJI_PATTERN.findall(text)
    return list(map(convert_emoji_to_text, found))


def extract_hashtags(text) -> list:
    """
    Function that extracts words preceded with a '#'
    eg. "I take care of my skin #selfcare#selfestim" --> the two hashtags
    "selfcare" and "selfestim"

    NOTE(review): the exact form of each returned item (e.g. whether the
    leading '#' is kept) is determined by ``constants.HASHTAG_PATTERN``,
    which is defined outside this module -- confirm against that pattern.

    Parameters
    ----------
    text : str
        text to search for hashtags

    Returns
    -------
    list
        all matches of ``constants.HASHTAG_PATTERN`` found in the text, in
        order of appearance (empty list when there is no match)
    """
    return constants.HASHTAG_PATTERN.findall(text)


def remove_hashtag(text) -> str:
    """
    Strip hashtags (words preceded with a '#') from a text
    eg. "I take care of my skin #selfcare#selfestim" --> "I take care of my skin"

    Every match of ``constants.HASHTAG_PATTERN`` is replaced by an empty
    string, then the result is passed through ``normalize_whitespace``.

    Parameters
    ----------
    text : str
        text of a post

    Returns
    -------
    str
        text of the post without hashtags, with its whitespace normalized
    """
    without_hashtags = constants.HASHTAG_PATTERN.sub('', text)
    return normalize_whitespace(without_hashtags)