text.py: 34 changes, 17 additions and 17 deletions
@@ -18,11 +18,11 @@
 class UnigramTextModel(CountingProbDist):

     """This is a discrete probability distribution over words, so you
-    can add, sample, or get P[word], just like with CountingProbDist. You can
-    also generate a random text n words long with P.samples(n)"""
+    can add, sample, or get P[word], just like with CountingProbDist. You can
+    also generate a random text n words long with P.samples(n)."""

     def samples(self, n):
-        "Return a string of n words, random according to the model."
+        """Return a string of n words, random according to the model."""
         return ' '.join(self.sample() for i in range(n))
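The samples method draws each word independently from the unigram distribution. A minimal self-contained sketch of the same idea, using a hypothetical unigram_samples helper (not part of the module) built on collections.Counter and random.choices:

```python
import random
from collections import Counter

def unigram_samples(corpus_words, n, rng=random):
    """Draw n words independently, weighted by corpus frequency,
    and join them into one string, like P.samples(n)."""
    counts = Counter(corpus_words)
    vocab = list(counts)
    weights = [counts[w] for w in vocab]
    return ' '.join(rng.choices(vocab, weights=weights, k=n))

corpus = "the cat sat on the mat".split()
print(unigram_samples(corpus, 5))  # e.g. 'the mat the cat on'
```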


@@ -97,12 +97,13 @@ def viterbi_segment(text, P):
     n = len(text)
     words = [''] + list(text)
     best = [1.0] + [0.0] * n
-    # Fill in the vectors best, words via dynamic programming
+    # Fill in the vectors best words via dynamic programming
     for i in range(n+1):
         for j in range(0, i):
             w = text[j:i]
-            if P[w] * best[i - len(w)] >= best[i]:
-                best[i] = P[w] * best[i - len(w)]
+            curr_score = P[w] * best[i - len(w)]
+            if curr_score >= best[i]:
+                best[i] = curr_score
                 words[i] = w
     # Now recover the sequence of best words
     sequence = []

Collaborator (Author), commenting on the curr_score line:

I have seen curr_something (or currSomething etc.) pop up a lot, so I assume most people will get it. The problem with p_score is that p alone doesn't give enough information, which is not the case with curr_score.

I personally feel it's fine, but I don't feel strongly about it. If more people have the same issue, I will change it.

Contributor:

You are right. I'm sure most won't have an issue.
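For readers following the refactor, here is the whole function in runnable form. The recovery loop after `sequence = []` falls outside this hunk and is reconstructed; the WordDist dict subclass with a tiny floor probability is an illustrative stand-in for the module's CountingProbDist:

```python
from collections import Counter

class WordDist(dict):
    """Word -> probability, with a tiny floor for unseen strings
    (a demo stand-in for a smoothed probability distribution)."""
    def __missing__(self, key):
        return 1e-10

def viterbi_segment(text, P):
    """Find the best segmentation of text into words, given a word
    probability distribution P; return (words, probability)."""
    n = len(text)
    words = [''] + list(text)    # words[i]: best last word ending at position i
    best = [1.0] + [0.0] * n     # best[i]: probability of the best split of text[:i]
    # Fill in the vectors best, words via dynamic programming
    for i in range(n + 1):
        for j in range(0, i):
            w = text[j:i]
            curr_score = P[w] * best[i - len(w)]
            if curr_score >= best[i]:
                best[i] = curr_score
                words[i] = w
    # Recover the sequence of best words by walking back from the end
    sequence = []
    i = n
    while i > 0:
        sequence.insert(0, words[i])
        i -= len(words[i])
    return sequence, best[-1]

counts = Counter(['it', 'is', 'easy'])
total = sum(counts.values())
P = WordDist({w: c / total for w, c in counts.items()})
seq, prob = viterbi_segment('itiseasy', P)
print(seq)  # → ['it', 'is', 'easy']
```

Each of the three words has probability 1/3, so the winning segmentation scores (1/3)**3, which beats any path forced through an unseen single character.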
@@ -124,7 +125,7 @@ class IRSystem:
     The constructor s = IRSystem('the a') builds an empty system with two
     stopwords. Next, index several documents with s.index_document(text, url).
     Then ask queries with s.query('query words', n) to retrieve the top n
-    matching documents. Queries are literal words from the document,
+    matching documents. Queries are literal words from the document,
     except that stopwords are ignored, and there is one special syntax:
     The query "learn: man cat", for example, runs "man cat" and indexes it."""

@@ -137,14 +138,14 @@ def __init__(self, stopwords='the a of'):
         self.documents = []

     def index_collection(self, filenames):
-        "Index a whole collection of files."
+        """Index a whole collection of files."""
         prefix = os.path.dirname(__file__)
         for filename in filenames:
             self.index_document(open(filename).read(),
                                 os.path.relpath(filename, prefix))

     def index_document(self, text, url):
-        "Index the text of a document."
+        """Index the text of a document."""
         # For now, use first line for title
         title = text[:text.index('\n')].strip()
         docwords = words(text)
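The IRSystem docstring describes the interface, but the scoring code sits outside this diff. A minimal sketch of that interface under simplified assumptions: the hypothetical TinyIRSystem below ranks documents by the count of matching query words, whereas the real class uses word frequencies and also supports the "learn:" syntax (omitted here):

```python
import re
from collections import defaultdict

class TinyIRSystem:
    """Index documents, then retrieve the ones matching the most
    query words; stopwords are ignored on both sides."""

    def __init__(self, stopwords='the a of'):
        self.stopwords = set(stopwords.split())
        self.index = defaultdict(set)   # word -> set of document ids
        self.documents = []             # (url, text) pairs

    def index_document(self, text, url):
        docid = len(self.documents)
        self.documents.append((url, text))
        for word in re.findall(r'[a-z]+', text.lower()):
            if word not in self.stopwords:
                self.index[word].add(docid)

    def query(self, query_text, n=10):
        terms = [w for w in re.findall(r'[a-z]+', query_text.lower())
                 if w not in self.stopwords]
        scores = defaultdict(int)
        for w in terms:
            for docid in self.index.get(w, ()):
                scores[docid] += 1
        ranked = sorted(scores.items(), key=lambda kv: -kv[1])
        return [(score, self.documents[docid][0]) for docid, score in ranked[:n]]

s = TinyIRSystem('the a')
s.index_document('the man and the cat', 'doc1')
s.index_document('a dog barks', 'doc2')
print(s.query('man cat'))  # → [(2, 'doc1')]
```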
@@ -278,7 +279,7 @@ def maketrans(from_, to_):


 def encode(plaintext, code):
-    """Encodes text, using a code which is a permutation of the alphabet."""
+    """Encode text, using a code which is a permutation of the alphabet."""
     trans = maketrans(alphabet + alphabet.upper(), code + code.upper())

     return translate(plaintext, trans)
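In Python 3 the same substitution-cipher encoding can be sketched with the built-in str.maketrans and str.translate (the module defines its own maketrans helper, presumably for compatibility):

```python
import string

alphabet = string.ascii_lowercase

def encode(plaintext, code):
    """Encode text, using a code which is a permutation of the alphabet."""
    trans = str.maketrans(alphabet + alphabet.upper(), code + code.upper())
    return plaintext.translate(trans)

# A Caesar shift of 3 is one particular permutation of the alphabet.
shifted = alphabet[3:] + alphabet[:3]
print(encode('Hello', shifted))  # → 'Khoor'
```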
@@ -298,7 +299,7 @@ def bigrams(text):


 class ShiftDecoder:

-    """There are only 26 possible encodings, so we can try all of them,
+    """There are only 26 possible encodings, so we can try all of them
     and return the one with the highest probability, according to a
     bigram probability distribution."""
@@ -333,19 +334,18 @@ def all_shifts(text):

 class PermutationDecoder:

-    """This is a much harder problem than the shift decoder. There are 26!
-    permutations, so we can't try them all. Instead we have to search.
+    """This is a much harder problem than the shift decoder. There are 26!
+    permutations, so we can't try them all. Instead we have to search.
     We want to search well, but there are many things to consider:
     Unigram probabilities (E is the most common letter); Bigram probabilities
     (TH is the most common bigram); word probabilities (I and A are the most
     common one-letter words, etc.); etc.
-    We could represent a search state as a permutation of the 26 letters,
-    and alter the solution through hill climbing. With an initial guess
+    We could represent a search state as a permutation of the 26 letters,
+    and alter the solution through hill climbing. With an initial guess
     based on unigram probabilities, this would probably fare well. However,
     I chose instead to have an incremental representation. A state is
     represented as a letter-to-letter map; for example {'z': 'e'} to
-    represent that 'z' will be translated to 'e'.
-    """
+    represent that 'z' will be translated to 'e'."""

     def __init__(self, training_text, ciphertext=None):
         self.Pwords = UnigramTextModel(words(training_text))
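The hill-climbing alternative mentioned in the docstring can be sketched as follows. This is a toy: hill_climb_decode is a hypothetical name, and letter-frequency scoring alone is far too weak to crack a real cipher, which is exactly why the docstring also lists bigram and word probabilities. The shape of the search (a full-permutation state, mutated by swapping two letters) is the point:

```python
import random
import string
from collections import Counter

LETTERS = string.ascii_lowercase

def hill_climb_decode(ciphertext, training_text, steps=1000, rng=random):
    """Search over full-alphabet permutations by swapping two letters at a
    time, keeping a swap only when it does not lower the frequency score."""
    freq = Counter(c for c in training_text.lower() if c in LETTERS)

    def score(perm):
        decoded = ciphertext.lower().translate(str.maketrans(LETTERS, perm))
        return sum(freq[c] for c in decoded if c in LETTERS)

    current = LETTERS  # initial state: the identity permutation
    for _ in range(steps):
        i, j = rng.sample(range(26), 2)
        swapped = list(current)
        swapped[i], swapped[j] = swapped[j], swapped[i]
        candidate = ''.join(swapped)
        if score(candidate) >= score(current):
            current = candidate
    return ciphertext.lower().translate(str.maketrans(LETTERS, current))
```

Seeding the start with a unigram-based guess, as the docstring suggests, would replace the identity permutation on the `current = LETTERS` line.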