diff --git a/nlp.py b/nlp.py
index f136cb035..c31af521b 100644
--- a/nlp.py
+++ b/nlp.py
@@ -7,6 +7,94 @@
 import urllib.request
 import re
+from collections import defaultdict
 
 
+# ______________________________________________________________________________
+# N-gram
+
+class NGram:
+    """An n-gram model built from a collection of texts.
+
+    An NGram is specified by the following inputs:
+    texts       a list of texts used to build the n-gram model
+    n           the size of each n-gram
+    characters  boolean indicating whether n-grams are built from
+                characters or from words
+    """
+
+    def __init__(self, texts, n, characters=False):
+        self.texts = texts
+        self.n = n
+        self.characters = characters
+        self.ngrams = {}
+        self.num_ngrams = 0
+
+        # If an n-gram was never seen, fall back to a small smoothing probability
+        self.probability_ngrams = defaultdict(lambda: 1e-11)
+
+        # Regex used to remove special characters from the text
+        self.special_characters_regex = re.compile(r'[^0-9a-zA-Z\s]+')
+
+    def create_ngrams(self):
+        """Count n-grams across all texts and total them for normalization."""
+        for text in self.texts:
+            self.count_ngrams(text)
+
+        for value in self.ngrams.values():
+            self.num_ngrams += value
+
+    def populate_ngram(self, token):
+        """Slide a window of size n over token, counting each n-gram seen."""
+        size = len(token)
+
+        if size < self.n:
+            return
+
+        for i in range(size - self.n + 1):
+            sequence = tuple(token[i:i + self.n])
+            self.ngrams[sequence] = self.ngrams.get(sequence, 0) + 1
+
+    def create_tokens(self, text):
+        # Remove special characters and lower-case the text
+        text = self.special_characters_regex.sub('', text)
+        text = text.lower()
+
+        # split() without arguments splits on any run of whitespace and
+        # discards empty tokens, so no further stripping is needed
+        return text.split()
+
+    def count_ngrams(self, text):
+        tokens = self.create_tokens(text)
+
+        if self.characters:
+            # Character model: n-grams never cross word boundaries
+            for token in tokens:
+                self.populate_ngram(token)
+        else:
+            self.populate_ngram(tokens)
+
+        return self.ngrams
+
+    def generate_probability(self):
+        """Convert raw counts into relative frequencies."""
+        for key, value in self.ngrams.items():
+            self.probability_ngrams[key] = value / self.num_ngrams
+
+
+class Unigram(NGram):
+    def __init__(self, texts, characters=False):
+        super().__init__(texts, n=1, characters=characters)
+
+
+class Bigram(NGram):
+    def __init__(self, texts, characters=False):
+        super().__init__(texts, n=2, characters=characters)
+
+
+class Trigram(NGram):
+    def __init__(self, texts, characters=False):
+        super().__init__(texts, n=3, characters=characters)
+
+
 # ______________________________________________________________________________
 # Grammars and Lexicons
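
A quick usage sketch of the API added above (the corpus strings here are
invented for illustration): create_ngrams() accumulates counts over all
texts, generate_probability() turns them into relative frequencies, and
unseen n-grams fall back to the 1e-11 smoothing default.

    from nlp import Bigram

    corpus = ['the quick brown fox', 'the quick red fox']
    bigram = Bigram(corpus)          # word-level bigrams
    bigram.create_ngrams()           # count bigrams across all texts
    bigram.generate_probability()    # counts -> relative frequencies

    print(bigram.probability_ngrams[('the', 'quick')])  # 2/6 = 0.333...
    print(bigram.probability_ngrams[('red', 'dog')])    # unseen: 1e-11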
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 43f71f163..d13c59359 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -4,10 +4,90 @@
 from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
 from nlp import getOutlinks, Page
 from nlp import Rules, Lexicon
+from nlp import Unigram, Bigram, Trigram
 
 # Clumsy imports because we want to access certain nlp.py globals explicitly, because
 # they are accessed by function's within nlp.py
 
+
+def test_ngram_character_count():
+    text_string = 'I like programming'
+
+    unigram = Unigram([], characters=True)
+    ngrams = unigram.count_ngrams(text_string)
+    expected_ngrams = {('l',): 1, ('i',): 3, ('k',): 1, ('e',): 1,
+                       ('p',): 1, ('r',): 2, ('o',): 1, ('g',): 2,
+                       ('a',): 1, ('m',): 2, ('n',): 1}
+
+    assert len(expected_ngrams) == len(ngrams)
+
+    for key, value in expected_ngrams.items():
+        assert key in ngrams
+        assert ngrams[key] == value
+
+    bigram = Bigram([], characters=True)
+    ngrams = bigram.count_ngrams(text_string)
+    expected_ngrams = {('l', 'i'): 1, ('i', 'k'): 1, ('k', 'e'): 1, ('p', 'r'): 1,
+                       ('r', 'o'): 1, ('o', 'g'): 1, ('g', 'r'): 1, ('r', 'a'): 1,
+                       ('a', 'm'): 1, ('m', 'm'): 1, ('m', 'i'): 1, ('i', 'n'): 1,
+                       ('n', 'g'): 1}
+
+    assert len(expected_ngrams) == len(ngrams)
+
+    for key, value in expected_ngrams.items():
+        assert key in ngrams
+        assert ngrams[key] == value
+
+    trigram = Trigram([], characters=True)
+    ngrams = trigram.count_ngrams(text_string)
+    expected_ngrams = {('l', 'i', 'k'): 1, ('i', 'k', 'e'): 1, ('p', 'r', 'o'): 1,
+                       ('r', 'o', 'g'): 1, ('o', 'g', 'r'): 1, ('g', 'r', 'a'): 1,
+                       ('r', 'a', 'm'): 1, ('a', 'm', 'm'): 1, ('m', 'm', 'i'): 1,
+                       ('m', 'i', 'n'): 1, ('i', 'n', 'g'): 1}
+
+    assert len(expected_ngrams) == len(ngrams)
+
+    for key, value in expected_ngrams.items():
+        assert key in ngrams
+        assert ngrams[key] == value
+
+
+def test_ngram_word_count():
+    text_string = 'I like learning about AI'
+
+    unigram = Unigram([])
+    ngrams = unigram.count_ngrams(text_string)
+    expected_ngrams = {('i',): 1, ('like',): 1, ('learning',): 1,
+                       ('about',): 1, ('ai',): 1}
+
+    assert len(expected_ngrams) == len(ngrams)
+
+    for key, value in expected_ngrams.items():
+        assert key in ngrams
+        assert ngrams[key] == value
+
+    bigram = Bigram([])
+    ngrams = bigram.count_ngrams(text_string)
+    expected_ngrams = {('i', 'like'): 1, ('like', 'learning'): 1,
+                       ('learning', 'about'): 1, ('about', 'ai'): 1}
+
+    assert len(expected_ngrams) == len(ngrams)
+
+    for key, value in expected_ngrams.items():
+        assert key in ngrams
+        assert ngrams[key] == value
+
+    trigram = Trigram([])
+    ngrams = trigram.count_ngrams(text_string)
+    expected_ngrams = {('i', 'like', 'learning'): 1, ('like', 'learning', 'about'): 1,
+                       ('learning', 'about', 'ai'): 1}
+
+    assert len(expected_ngrams) == len(ngrams)
+
+    for key, value in expected_ngrams.items():
+        assert key in ngrams
+        assert ngrams[key] == value
+
 
 def test_rules():
     assert Rules(A="B C | D E") == {'A': [['B', 'C'], ['D', 'E']]}
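
The tests above only exercise the counting path. A minimal sketch of a
companion test for the probability step, assuming the same classes and
the 1e-11 smoothing default, might look like this:

    def test_ngram_probability():
        unigram = Unigram(['a b a'])
        unigram.create_ngrams()
        unigram.generate_probability()

        # 'a' occurs twice among three word unigrams, 'b' once
        assert abs(unigram.probability_ngrams[('a',)] - 2 / 3) < 1e-9
        assert abs(unigram.probability_ngrams[('b',)] - 1 / 3) < 1e-9

        # unseen unigrams fall back to the smoothing default
        assert unigram.probability_ngrams[('c',)] == 1e-11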