"""Source code for putput.presets.stochastic."""

import random
from functools import partial
from operator import itemgetter
from typing import Any
from typing import Callable
from typing import List
from typing import Mapping
from typing import Sequence
from typing import Set
from typing import Tuple

import nltk

# Populated by _init_nltk() with wordnet's four POS constants (ADJ, VERB, NOUN, ADV).
# Empty until _init_nltk() runs.
WORDNET_POS_TAGS = set()  # type: Set[str]
# Rebound to nltk.corpus.wordnet by _init_nltk(); None until initialization.
wordnet = None  # type: nltk.corpus.util.LazyCorpusLoader

def preset(*, chance: int = 20) -> Callable:
    """Randomly replaces words with synonyms from wordnet synsets.

    Tags each word in the utterance with nltk's part of speech tagger.
    Using the part of speech, each word in the utterance is replaced with
    a randomly chosen word from the first synset with the same part of
    speech as the word to replace, subject to the specified chance.
    If no synset exists with the same part of speech, the original word
    will not be replaced.

    Downloads nltk's wordnet, punkt, and averaged_perceptron_tagger
    if non-existent on the host.

    Args:
        chance: The chance between [0, 100] for each word to be replaced
            by a synonym.

    Raises:
        ValueError: If chance is not an integer in [0, 100].

    Returns:
        A Callable that when called returns parameters for instantiating
        a Pipeline. This Callable can be passed into putput.Pipeline as
        the 'preset' argument.

    Examples:
        >>> from pathlib import Path
        >>> from putput.pipeline import Pipeline
        >>> pattern_def_path = Path(__file__).parent.parent.parent / 'tests' / 'doc' / 'example_pattern_definition.yml'
        >>> dynamic_token_patterns_map = {'ITEM': ('fries',)}
        >>> p = Pipeline.from_preset(preset(chance=100),
        ...                          pattern_def_path,
        ...                          dynamic_token_patterns_map=dynamic_token_patterns_map,
        ...                          seed=0)
        >>> generator = p.flow(disable_progress_bar=True)
        >>> for utterance, tokens, groups in generator:
        ...     print(utterance)
        ...     print(tokens)
        ...     print(groups)
        ...     break
        can she acquire chips can she acquire french-fried_potatoes and french_fries
        ('[ADD(can she acquire)]', '[ITEM(chips)]', '[ADD(can she acquire)]', '[ITEM(french-fried_potatoes)]', '[CONJUNCTION(and)]', '[ITEM(french_fries)]')
        ('{[ADD(can she acquire)] [ITEM(chips)]}', '{[ADD(can she acquire)] [ITEM(french-fried_potatoes)]}', '{[CONJUNCTION(and)]}', '{[ITEM(french_fries)]}')
    """
    if chance not in range(101):
        # BUG FIX: the original message contained a '{}' placeholder but never
        # called .format(), so the offending value was not shown.
        raise ValueError('Invalid chance: {}. Chance accepts any integer between [0, 100]'.format(chance))
    _init_nltk()
    return partial(_preset, chance=chance)
def _preset(chance: int, **kwargs: Any) -> Mapping: # pylint: disable=W0613 combo_hooks_map = { 'DEFAULT': (partial(_replace_with_synonyms, chance=chance),) } return { 'combo_hooks_map': combo_hooks_map } def _init_nltk() -> None: global WORDNET_POS_TAGS # pylint: disable=global-statement global wordnet # pylint: disable=global-statement nltk.download('wordnet') nltk.download('punkt') nltk.download('averaged_perceptron_tagger') from nltk.corpus import wordnet # pylint: disable=redefined-outer-name WORDNET_POS_TAGS = {wordnet.ADJ, wordnet.VERB, wordnet.NOUN, wordnet.ADV} def _replace_with_synonyms(utterance: str, handled_tokens: Sequence[str], handled_groups: Sequence[str], chance: int ) -> Tuple[str, Sequence[str], Sequence[str]]: _, _ = handled_tokens, handled_groups pos = _pos_tag_for_wordnet(utterance) return _replace_utterance_tokens_groups_with_synonyms(handled_groups, pos, chance) def _replace_utterance_tokens_groups_with_synonyms(handled_groups: Sequence[str], pos: Sequence[str], chance: int ) -> Tuple[str, Sequence[str], Sequence[str]]: pos_position = 0 synonym_utterances, synonym_tokens, synonym_groups = [], [], [] # type: List[str], List[str], List[str] for handled_group in handled_groups: syn_utterance_components, syn_token_components, pos_position = _replace_components_with_synonyms(handled_group, pos, pos_position, chance) synonym_utterance_component = ' '.join(syn_utterance_components) synonym_utterances.append(synonym_utterance_component) synonym_token_components = ['[{}({})]'.format(t, u) for u, t in zip(syn_utterance_components, syn_token_components)] synonym_tokens += synonym_token_components synonym_group_component = '{{{}}}'.format(' '.join(synonym_token_components)) synonym_groups.append(synonym_group_component) return ' '.join(synonym_utterances), tuple(synonym_tokens), tuple(synonym_groups) def _replace_components_with_synonyms(handled_group: str, pos: Sequence[str], pos_position: int, chance: int ) -> Tuple[Sequence[str], Sequence[str], 
int]: num_parens = 0 syn_utterance_components, syn_token_components = [], [] # type: List[str], List[str] for position, char in enumerate(handled_group): if char == '[': start_tokens_index = position + 1 if char == '(': num_parens += 1 if num_parens == 2: syn_token_components.append(handled_group[start_tokens_index:position]) start_utterance_index = position + 1 if char == ')': num_parens -= 1 if num_parens == 1: handled_original_utterance_component = handled_group[start_utterance_index:position] handled_utterance_component_words = handled_original_utterance_component.split() for i, word in enumerate(handled_utterance_component_words): if random.random() < (chance / 100) and pos[pos_position] in WORDNET_POS_TAGS: handled_utterance_component_words[i] = _get_synonym(word, pos[pos_position]) pos_position += 1 syn_utterance_components.append(' '.join(handled_utterance_component_words)) return syn_utterance_components, syn_token_components, pos_position def _get_wordnet_pos(tag: str) -> str: if tag.startswith('J'): return wordnet.ADJ if tag.startswith('V'): return wordnet.VERB if tag.startswith('N'): return wordnet.NOUN if tag.startswith('R'): return wordnet.ADV return '' def _pos_tag_for_wordnet(utterance: str) -> Tuple[str, ...]: tags = nltk.pos_tag(nltk.word_tokenize(utterance)) return tuple(map(_get_wordnet_pos, tuple(map(itemgetter(1), tags)))) def _get_synonym(word: str, tag: str) -> str: synsets = wordnet.synsets(word) for synset in synsets: if synset.pos() == tag: synonym = random.choice(synset.lemma_names()) if synonym: word = synonym break return word