nlp.py (98 additions, 0 deletions)
@@ -7,6 +7,104 @@
from collections import defaultdict
import urllib.request
import re

# ______________________________________________________________________________
# N-gram

class NGram:
"""This class describe an n-gram model for a range of text.
An NGram is specified by the following inputs:
texts An array of text that will be used to create the n-gram
model
n The size of the n-gram pair that will be generated.
characters Boolean indicating if the n-gram will considers characters
or words
"""

def __init__(self, texts, n, characters=False):
self.texts = texts
self.n = n
self.characters = characters
self.ngrams = {}
self.num_ngrams = 0

"""If ngram is not present, return a small probability for it"""
self.probability_ngrams = defaultdict(lambda: 0.00000000001)

"""regex to remove special characters from text"""
self.special_characters_regex = re.compile('[^0-9a-zA-Z\s]+')



    def create_ngrams(self):
        for text in self.texts:
            self.count_ngrams(text)

        self.num_ngrams = sum(self.ngrams.values())

    def populate_ngram(self, token):
        """Slide a window of size n over token, counting each n-gram seen."""
        size = len(token)

        if size < self.n:
            return

        for i in range(size - self.n + 1):
            word_sequence = tuple(token[i:i + self.n])
            self.ngrams[word_sequence] = self.ngrams.get(word_sequence, 0) + 1

    def create_tokens(self, text):
        # Remove leading and trailing spaces
        text = text.strip()

        # Remove special characters from the text
        text = self.special_characters_regex.sub('', text)

        # Guarantee that the whole text is lower case
        text = text.lower()

        # Create tokens
        tokens = text.split(' ')

        # Remove empty tokens
        tokens = list(filter(None, tokens))

        # Strip any remaining whitespace from each token
        tokens = [token.strip() for token in tokens]

        return tokens

def count_ngrams(self, text):
tokens = self.create_tokens(text)

        # In character mode, build n-grams within each token; in word mode,
        # build them across the whole token sequence.
        if self.characters:
for token in tokens:
self.populate_ngram(token)
else:
self.populate_ngram(tokens)

return self.ngrams

    def generate_probability(self):
        """Convert raw n-gram counts into maximum-likelihood probabilities."""
        for key, value in self.ngrams.items():
            self.probability_ngrams[key] = value / self.num_ngrams


class Unigram(NGram):
def __init__(self, texts, characters=False):
super().__init__(texts, n=1, characters=characters)


class Bigram(NGram):
def __init__(self, texts, characters=False):
super().__init__(texts, n=2, characters=characters)


class Trigram(NGram):
def __init__(self, texts, characters=False):
super().__init__(texts, n=3, characters=characters)
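

# A minimal usage sketch (illustrative only; the two texts below are made up):
# build a word-level bigram model and look up the probability of one
# observed bigram.
def _ngram_demo():
    bigram = Bigram(['the quick brown fox', 'the lazy dog'])
    bigram.create_ngrams()         # count bigrams across both texts
    bigram.generate_probability()  # normalize counts into probabilities
    # five bigrams are observed in total, so ('the', 'quick') has
    # probability 1/5
    return bigram.probability_ngrams[('the', 'quick')]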

# ______________________________________________________________________________
# Grammars and Lexicons

tests/test_nlp.py (79 additions, 0 deletions)
@@ -4,10 +4,89 @@
from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
from nlp import getOutlinks, Page
from nlp import Rules, Lexicon
from nlp import Unigram, Bigram, Trigram
# Clumsy imports because we want to access certain nlp.py globals explicitly,
# because they are accessed by functions within nlp.py


def test_ngram_character_count():
text_string = 'I like programming'

unigram = Unigram([], characters=True)
ngrams = unigram.count_ngrams(text_string)
expected_ngrams = {('l',): 1, ('i',): 3, ('k',): 1, ('e',): 1,
('p',): 1, ('r',): 2, ('o',): 1, ('g',): 2, ('a',): 1, ('m',): 2,
('n',): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value

bigram = Bigram([], characters=True)
ngrams = bigram.count_ngrams(text_string)
expected_ngrams = {('l', 'i'): 1, ('i', 'k'): 1, ('k', 'e'): 1, ('p', 'r'): 1,
('r', 'o'): 1, ('o', 'g'): 1, ('g', 'r'): 1, ('r', 'a'): 1, ('a', 'm'): 1,
('m', 'm'): 1, ('m', 'i'): 1, ('i', 'n'): 1, ('n', 'g'): 1}

assert len(expected_ngrams) == len(ngrams)

    for key, value in expected_ngrams.items():
        assert key in ngrams
        assert ngrams[key] == value

trigram = Trigram([], characters=True)
ngrams = trigram.count_ngrams(text_string)
    expected_ngrams = {('l', 'i', 'k'): 1, ('i', 'k', 'e'): 1, ('p', 'r', 'o'): 1,
        ('r', 'o', 'g'): 1, ('o', 'g', 'r'): 1, ('g', 'r', 'a'): 1,
        ('r', 'a', 'm'): 1, ('a', 'm', 'm'): 1,
        ('m', 'm', 'i'): 1, ('m', 'i', 'n'): 1, ('i', 'n', 'g'): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value


def test_ngram_word_count():
text_string = "I like learning about IA"

unigram = Unigram([])
ngrams = unigram.count_ngrams(text_string)
expected_ngrams = {('i',): 1, ('like',): 1, ('learning',): 1,
('about',): 1, ('ia',): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value

    bigram = Bigram([])
    ngrams = bigram.count_ngrams(text_string)
expected_ngrams = {('i', 'like'): 1, ('like', 'learning'): 1, ('learning', 'about'): 1,
('about', 'ia'): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value

    trigram = Trigram([])
    ngrams = trigram.count_ngrams(text_string)
expected_ngrams = {('i', 'like', 'learning'): 1, ('like', 'learning', 'about'): 1,
('learning', 'about', 'ia'): 1}

assert len(expected_ngrams) == len(ngrams)

for key, value in expected_ngrams.items():
assert key in ngrams
assert ngrams[key] == value
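

# A minimal additional test (a sketch, not part of the original change; the
# tiny corpus is made up): generate_probability should normalize counts over
# all observed n-grams, and unseen n-grams should fall back to the small
# default probability.
def test_ngram_probability():
    unigram = Unigram(['a b a'])
    unigram.create_ngrams()
    unigram.generate_probability()

    # 'a' occurs twice and 'b' once among the three observed unigrams
    assert unigram.probability_ngrams[('a',)] == 2 / 3
    assert unigram.probability_ngrams[('b',)] == 1 / 3
    # an unseen unigram gets the tiny default value instead of a KeyError
    assert unigram.probability_ngrams[('c',)] == 0.00000000001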


def test_rules():
assert Rules(A="B C | D E") == {'A': [['B', 'C'], ['D', 'E']]}
