# Source code for putput.presets.stochastic

import random
from functools import partial
from operator import itemgetter
from typing import Any
from typing import Callable
from typing import List
from typing import Mapping
from typing import Sequence
from typing import Set
from typing import Tuple

import nltk

# Wordnet part-of-speech constants that are eligible for synonym replacement.
# Starts empty; _init_nltk() rebinds it once the wordnet corpus is available.
WORDNET_POS_TAGS = set()  # type: Set[str]
# Placeholder for nltk's wordnet corpus; _init_nltk() rebinds it via
# `from nltk.corpus import wordnet`. Helpers that read it must run after that.
wordnet = None  # type: nltk.corpus.util.LazyCorpusLoader

def preset(*, chance: int = 20) -> Callable:
    """Randomly replaces words with synonyms from wordnet synsets.

    Tags each word in the utterance with nltk's part of speech tagger. Using
    the part of speech, each word in the utterance is replaced with a randomly
    chosen word from the first synset with the same part of speech as the word
    to replace, subject to the specified chance. If no synset exists with the
    same part of speech, the original word will not be replaced.

    Downloads nltk's wordnet, punkt, and averaged_perceptron_tagger if non-existent
    on the host.

    Args:
        chance: The chance between [0, 100] for each word to be replaced by
            a synonym.

    Returns:
        A Callable that when called returns parameters for instantiating a Pipeline.
        This Callable can be passed into putput.Pipeline as the 'preset' argument.

    Raises:
        ValueError: If chance is not an integer between [0, 100]. The check
            runs before any nltk resource is downloaded.

    Examples:
        >>> from pathlib import Path
        >>> from putput.pipeline import Pipeline
        >>> pattern_def_path = Path(__file__).parent.parent.parent / 'tests' / 'doc' / 'example_pattern_definition.yml'
        >>> dynamic_token_patterns_map = {'ITEM': ('fries',)}
        >>> p = Pipeline.from_preset(preset(chance=100),
        ...                          pattern_def_path,
        ...                          dynamic_token_patterns_map=dynamic_token_patterns_map,
        ...                          seed=0)
        >>> generator = p.flow(disable_progress_bar=True)
        >>> for utterance, tokens, groups in generator:
        ...     print(utterance)
        ...     print(tokens)
        ...     print(groups)
        ...     break
        can she acquire chips can she acquire french-fried_potatoes and french_fries
        ('[ADD(can she acquire)]', '[ITEM(chips)]',
         '[ADD(can she acquire)]', '[ITEM(french-fried_potatoes)]',
         '[CONJUNCTION(and)]', '[ITEM(french_fries)]')
        ('{[ADD(can she acquire)] [ITEM(chips)]}',
         '{[ADD(can she acquire)] [ITEM(french-fried_potatoes)]}',
         '{[CONJUNCTION(and)]}', '{[ITEM(french_fries)]}')
    """
    if chance not in range(101):
        # Interpolate the rejected value; the message previously left the
        # '{}' placeholder unformatted, hiding what the caller passed in.
        raise ValueError(
            'Invalid chance: {}. Chance accepts any integer between [0, 100]'.format(chance)
        )
    _init_nltk()
    return partial(_preset, chance=chance)


def _preset(chance: int, **kwargs: Any) -> Mapping: # pylint: disable=W0613
    """Build the Pipeline parameters for the stochastic preset.

    Args:
        chance: Replacement chance forwarded to the synonym combo hook.
        **kwargs: Accepted and ignored.

    Returns:
        A mapping with a single 'combo_hooks_map' entry whose 'DEFAULT' key
        holds a one-element tuple containing the synonym replacement hook.
    """
    synonym_hook = partial(_replace_with_synonyms, chance=chance)
    return {'combo_hooks_map': {'DEFAULT': (synonym_hook,)}}


def _init_nltk() -> None:
    """Download the nltk resources this preset needs and bind the module globals.

    Requests 'wordnet', 'punkt' and 'averaged_perceptron_tagger' through
    nltk.download, then rebinds the module-level `wordnet` name to nltk's
    wordnet corpus and fills WORDNET_POS_TAGS with its ADJ, VERB, NOUN and ADV
    constants.
    """
    global WORDNET_POS_TAGS, wordnet  # pylint: disable=global-statement
    for resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
        nltk.download(resource)
    # Imported here, after the download requests; because of the `global`
    # statement above this import rebinds the module-level name.
    from nltk.corpus import wordnet  # pylint: disable=redefined-outer-name
    WORDNET_POS_TAGS = set((wordnet.ADJ, wordnet.VERB, wordnet.NOUN, wordnet.ADV))


def _replace_with_synonyms(utterance: str,
                           handled_tokens: Sequence[str],
                           handled_groups: Sequence[str],
                           chance: int
                           ) -> Tuple[str, Sequence[str], Sequence[str]]:
    """Hook that swaps words of an utterance for wordnet synonyms.

    Args:
        utterance: The utterance to part-of-speech tag.
        handled_tokens: Not used; the returned tokens are rebuilt from
            handled_groups.
        handled_groups: The handled groups that are parsed and rewritten.
        chance: The chance between [0, 100] for each word to be replaced.

    Returns:
        The rewritten utterance, tokens and groups.
    """
    _ = handled_tokens  # part of the hook signature only
    wordnet_tags = _pos_tag_for_wordnet(utterance)
    return _replace_utterance_tokens_groups_with_synonyms(handled_groups, wordnet_tags, chance)


def _replace_utterance_tokens_groups_with_synonyms(handled_groups: Sequence[str],
                                                   pos: Sequence[str],
                                                   chance: int
                                                   ) -> Tuple[str, Sequence[str], Sequence[str]]:
    pos_position = 0
    synonym_utterances, synonym_tokens, synonym_groups = [], [], [] # type: List[str], List[str], List[str]
    for handled_group in handled_groups:
        syn_utterance_components, syn_token_components, pos_position = _replace_components_with_synonyms(handled_group,
                                                                                                         pos,
                                                                                                         pos_position,
                                                                                                         chance)
        synonym_utterance_component = ' '.join(syn_utterance_components)
        synonym_utterances.append(synonym_utterance_component)

        synonym_token_components = ['[{}({})]'.format(t, u)
                                    for u, t in zip(syn_utterance_components, syn_token_components)]
        synonym_tokens += synonym_token_components

        synonym_group_component = '{{{}}}'.format(' '.join(synonym_token_components))
        synonym_groups.append(synonym_group_component)
    return ' '.join(synonym_utterances), tuple(synonym_tokens), tuple(synonym_groups)


def _replace_components_with_synonyms(handled_group: str,
                                      pos: Sequence[str],
                                      pos_position: int,
                                      chance: int
                                      ) -> Tuple[Sequence[str], Sequence[str], int]:
    num_parens = 0
    syn_utterance_components, syn_token_components = [], [] # type: List[str], List[str]
    for position, char in enumerate(handled_group):
        if char == '[':
            start_tokens_index = position + 1
        if char == '(':
            num_parens += 1
            if num_parens == 2:
                syn_token_components.append(handled_group[start_tokens_index:position])
                start_utterance_index = position + 1
        if char == ')':
            num_parens -= 1
            if num_parens == 1:
                handled_original_utterance_component = handled_group[start_utterance_index:position]
                handled_utterance_component_words = handled_original_utterance_component.split()
                for i, word in enumerate(handled_utterance_component_words):
                    if random.random() < (chance / 100) and pos[pos_position] in WORDNET_POS_TAGS:
                        handled_utterance_component_words[i] = _get_synonym(word, pos[pos_position])
                    pos_position += 1
                syn_utterance_components.append(' '.join(handled_utterance_component_words))
    return syn_utterance_components, syn_token_components, pos_position


def _get_wordnet_pos(tag: str) -> str:
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return ''


def _pos_tag_for_wordnet(utterance: str) -> Tuple[str, ...]:
    """Return one wordnet part-of-speech tag per whitespace-separated word.

    The result is consumed positionally by _replace_components_with_synonyms,
    which walks the words produced by str.split(). Tagging that same
    whitespace split keeps index i of the result aligned with word i.
    Tagging nltk.word_tokenize output instead can yield extra tokens for
    punctuation and contractions (e.g. "don't" -> "do", "n't"), which shifts
    every later tag onto the wrong word.

    Args:
        utterance: The utterance to tag.

    Returns:
        A tuple with one entry per whitespace-separated word: the wordnet
        part-of-speech constant for the word, or '' when the tag has no
        wordnet equivalent.
    """
    tagged_words = nltk.pos_tag(utterance.split())
    return tuple(_get_wordnet_pos(tag) for _, tag in tagged_words)


def _get_synonym(word: str, tag: str) -> str:
    """Pick a random lemma from the first synset of word whose pos equals tag.

    Args:
        word: The word to look up in wordnet.
        tag: The wordnet part-of-speech constant the synset must match.

    Returns:
        A randomly chosen lemma name of the first matching synset that yields
        a non-empty choice, or word unchanged when there is none.
    """
    for candidate in wordnet.synsets(word):
        if candidate.pos() != tag:
            continue
        choice = random.choice(candidate.lemma_names())
        if choice:
            return choice
    return word