nlp.py (98 additions, 0 deletions)
@@ -7,6 +7,104 @@
from collections import defaultdict
import urllib.request
import re

# ______________________________________________________________________________
# N-gram

class NGram:
"""This class describe an n-gram model for a range of text.
An NGram is specified by the following inputs:
texts An array of text that will be used to create the n-gram
model
n The size of the n-gram pair that will be generated.
characters Boolean indicating if the n-gram will considers characters
or words
"""

def __init__(self, texts, n, characters=False):
self.texts = texts
self.n = n
self.characters = characters
self.ngrams = {}
self.num_ngrams = 0

"""If ngram is not present, return a small probability for it"""
self.probability_ngrams = defaultdict(lambda: 0.00000000001)

"""regex to remove special characters from text"""
self.special_characters_regex = re.compile('[^0-9a-zA-Z\s]+')



    def create_ngrams(self):
        for text in self.texts:
            self.count_ngrams(text)

        self.num_ngrams = sum(self.ngrams.values())

    def populate_ngram(self, token):
        """Slide a window of size n over token, counting each n-gram seen."""
        size = len(token)

        if size < self.n:
            return

        for i in range(size - self.n + 1):
            word_sequence = tuple(token[i:i + self.n])
            self.ngrams[word_sequence] = self.ngrams.get(word_sequence, 0) + 1

    def create_tokens(self, text):
        # Remove leading and trailing spaces
        text = text.strip()

        # Remove special characters from the text
        text = self.special_characters_regex.sub('', text)

        # Guarantee that the whole text is lower case
        text = text.lower()

        # Create tokens
        tokens = text.split(' ')

        # Remove empty tokens
        tokens = list(filter(None, tokens))

        # Strip any remaining whitespace from each token
        tokens = [token.strip() for token in tokens]

        return tokens

def count_ngrams(self, text):
tokens = self.create_tokens(text)

        # In character mode, build n-grams within each token; in word mode,
        # build them across the whole token sequence.
        if self.characters:
for token in tokens:
self.populate_ngram(token)
else:
self.populate_ngram(tokens)

return self.ngrams

    def generate_probability(self):
        """Convert raw n-gram counts into maximum-likelihood probabilities."""
        for key, value in self.ngrams.items():
            self.probability_ngrams[key] = value / self.num_ngrams


class Unigram(NGram):
def __init__(self, texts, characters=False):
super().__init__(texts, n=1, characters=characters)


class Bigram(NGram):
def __init__(self, texts, characters=False):
super().__init__(texts, n=2, characters=characters)


class Trigram(NGram):
def __init__(self, texts, characters=False):
super().__init__(texts, n=3, characters=characters)
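

# A minimal usage sketch (illustrative only; the two texts below are made up):
# build a word-level bigram model and look up the probability of one
# observed bigram.
def _ngram_demo():
    bigram = Bigram(['the quick brown fox', 'the lazy dog'])
    bigram.create_ngrams()         # count bigrams across both texts
    bigram.generate_probability()  # normalize counts into probabilities
    # five bigrams are observed in total, so ('the', 'quick') has
    # probability 1/5
    return bigram.probability_ngrams[('the', 'quick')]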

# ______________________________________________________________________________
# Grammars and Lexicons

tests/test_nlp.py (79 additions, 0 deletions)
@@ -4,10 +4,89 @@
from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
from nlp import getOutlinks, Page
from nlp import Rules, Lexicon
from nlp import Unigram, Bigram, Trigram
# Clumsy imports because we want to access certain nlp.py globals explicitly,
# because they are accessed by functions within nlp.py


def test_ngram_character_count():
text_string = 'I like programming'

unigram = Unigram([], characters=True)
ngrams = unigram.count_ngrams(text_string)
expected_ngrams = {('l',): 1, ('i',): 3, ('k',): 1, ('e',): 1,
('p',): 1, ('r',): 2, ('o',): 1, ('g',): 2, ('a',): 1, ('m',): 2,
('n',): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value

bigram = Bigram([], characters=True)
ngrams = bigram.count_ngrams(text_string)
expected_ngrams = {('l', 'i'): 1, ('i', 'k'): 1, ('k', 'e'): 1, ('p', 'r'): 1,
('r', 'o'): 1, ('o', 'g'): 1, ('g', 'r'): 1, ('r', 'a'): 1, ('a', 'm'): 1,
('m', 'm'): 1, ('m', 'i'): 1, ('i', 'n'): 1, ('n', 'g'): 1}

assert len(expected_ngrams) == len(ngrams)

    for key, value in expected_ngrams.items():
        assert key in ngrams
        assert ngrams[key] == value

trigram = Trigram([], characters=True)
ngrams = trigram.count_ngrams(text_string)
    expected_ngrams = {('l', 'i', 'k'): 1, ('i', 'k', 'e'): 1, ('p', 'r', 'o'): 1,
        ('r', 'o', 'g'): 1, ('o', 'g', 'r'): 1, ('g', 'r', 'a'): 1,
        ('r', 'a', 'm'): 1, ('a', 'm', 'm'): 1,
        ('m', 'm', 'i'): 1, ('m', 'i', 'n'): 1, ('i', 'n', 'g'): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value


def test_ngram_word_count():
text_string = "I like learning about IA"

unigram = Unigram([])
ngrams = unigram.count_ngrams(text_string)
expected_ngrams = {('i',): 1, ('like',): 1, ('learning',): 1,
('about',): 1, ('ia',): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value

    bigram = Bigram([])
    ngrams = bigram.count_ngrams(text_string)
expected_ngrams = {('i', 'like'): 1, ('like', 'learning'): 1, ('learning', 'about'): 1,
('about', 'ia'): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value

    trigram = Trigram([])
    ngrams = trigram.count_ngrams(text_string)
expected_ngrams = {('i', 'like', 'learning'): 1, ('like', 'learning', 'about'): 1,
('learning', 'about', 'ia'): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value
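

# A minimal additional test (a sketch, not part of the original change; the
# tiny corpus is made up): generate_probability should normalize counts over
# all observed n-grams, and unseen n-grams should fall back to the small
# default probability.
def test_ngram_probability():
    unigram = Unigram(['a b a'])
    unigram.create_ngrams()
    unigram.generate_probability()

    # 'a' occurs twice and 'b' once among the three observed unigrams
    assert unigram.probability_ngrams[('a',)] == 2 / 3
    assert unigram.probability_ngrams[('b',)] == 1 / 3
    # an unseen unigram gets the tiny default value instead of a KeyError
    assert unigram.probability_ngrams[('c',)] == 0.00000000001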


def test_rules():
assert Rules(A="B C | D E") == {'A': [['B', 'C'], ['D', 'E']]}
