Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 147 additions & 0 deletions 147 src/main/java/com/matchings/stringMatching/Horspool.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
package com.matchings.stringMatching;

import java.util.HashMap;

/**
* (From wikipedia)
* In computer science, the Boyer–Moore–Horspool algorithm or Horspool's algorithm is an algorithm for finding
* substrings in strings. It was published by Nigel Horspool in 1980.
*
* An explanation:
*
* The Horspool algorithm is a simplification of the Boyer-Moore algorithm in that it uses only one of the two heuristic
* methods for increasing the number of characters shifted when finding a bad match in the text. This method is usually
* called the "bad symbol" or "bad character" shift. The bad symbol shift method is classified as an input enhancement
* method in the theory of algorithms. Input enhancement is (from wikipedia) the principle that processing a given input
* to a problem and altering it in a specific way will increase runtime efficiency or space efficiency, or both. Both
* algorithms try to match the pattern and text comparing the pattern symbols to the text's from right to left.
*
* In the bad symbol shift method, a table is created prior to the search, called the "bad symbol table". The bad symbol
* table contains the shift values for any symbol in the text and pattern. For these symbols, the value is the length of
* the pattern, if the symbol is not in the first (length - 1) of the pattern. Else it is the distance from its
* rightmost occurence in the pattern to the last symbol of the pattern. In practice, we only calculate the values for
* the ones that exist in the first (length - 1) of the pattern.
*
* For more details on the algorithm and the more advanced Boyer-Moore I recommend checking out the wikipedia page and
* professor Anany Levitin's book: Introduction To The Design And Analysis Of Algorithms.
*/
public class Horspool {

private static HashMap<Character, Integer> shiftValues; // bad symbol table
private static Integer patternLength;
private static int comparisons = -1; // total comparisons in the current/last search

/**
* Non case sensitive version of the algorithm
*/
public static Integer findFirstOccurrence(String pattern, String text) {
return firstOccurrence(pattern, text, false);
}

/**
* Case sensitive version
*/
public static Integer findFirstExactOccurrence(String pattern, String text) {
return firstOccurrence(pattern, text, true);
}

/**
* Fairly standard implementation of the Horspool algorithm. Only the index of the last character of the pattern on the
* text is saved and shifted by the appropriate amount when a mismatch is found. The algorithm stops at the first
* match or when the entire text has been exhausted.
*
* @param pattern String to be matched in the text
* @param text text String
* @return index of first occurrence of the pattern in the text
*/
private static Integer firstOccurrence(String pattern, String text, boolean caseSensitive) {
shiftValues = calcShiftValues(pattern); // build the bad symbol table
comparisons = 0; // reset comparisons

int textIndex = pattern.length() - 1; // align pattern with text start and get index of the last character

// while pattern is not out of text bounds
while (textIndex < text.length()) {

// try to match pattern with current part of the text starting from last character
int i=pattern.length() - 1;
while (i>=0) {
comparisons++;
if (!charEquals(pattern.charAt(i), text.charAt(textIndex + i - pattern.length() + 1), caseSensitive)) { // bad character, shift pattern
textIndex += getShiftValue(text.charAt(textIndex));
break;
}
i--;
}

// check for full match
if (i == -1) {
return textIndex - pattern.length() + 1;
}
}

// text exhausted, return failure
return null;
}

/**
* Utility method (mainly for tests)
*
* @return number of character comparisons of the last search
*/
public static Integer getLastComparisons() {
return Horspool.comparisons;
}

/**
* Compares the argument characters
*
* @param c1 first character
* @param c2 second character
* @param caseSensitive boolean determining case sensitivity of comparison
* @return truth value of the equality comparison
*/
private static boolean charEquals(char c1, char c2, boolean caseSensitive) {
if (caseSensitive) {
return c1 == c2;
}
return Character.toLowerCase(c1) == Character.toLowerCase(c2);
}

/**
* Builds the bad symbol table required to run the algorithm. The method starts from the second to last character
* of the pattern and moves to the left. When it meets a new character, it is by definition its rightmost occurrence
* and therefore puts the distance from the current index to the index of the last character into the table. If the
* character is already in the table, then it is not a rightmost occurrence, so it continues.
*
* @param pattern basis for the bad symbol table
* @return the bad symbol table
*/
private static HashMap<Character, Integer> calcShiftValues(String pattern) {
patternLength = pattern.length();
HashMap<Character, Integer> table = new HashMap<>();

for (int i=pattern.length()-2; i>=0; i--) { // length - 2 is the index of the second to last character
char c = pattern.charAt(i);
int finalI = i;
table.computeIfAbsent(c, k -> pattern.length() - 1 - finalI);
}

return table;
}

/**
* Helper function that uses the bad symbol shift table to return the appropriate shift value for a given character
*
* @param c character
* @return shift value that corresponds to the character argument
*/
private static Integer getShiftValue(char c) {
if (shiftValues.get(c) != null) {
return shiftValues.get(c);
} else {
return patternLength;
}
}

}
56 changes: 56 additions & 0 deletions 56 src/test/java/com/matchings/stringMatching/HorspoolTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package com.matchings.stringMatching;

import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

public class HorspoolTest {

@Test
void testFirstOccurrence() {
String text = "BESS_KNEW_ABOUT_BAOBABS";

String patternThatExists = "BAOBAB";
Integer expectedIndex = 16;
assertEquals(expectedIndex, Horspool.findFirstOccurrence(patternThatExists, text));

String patterThatDoesNotExist = "BESSY";
assertNull(Horspool.findFirstOccurrence(patterThatDoesNotExist, text));
}

@Test
void testFirstExactOccurrence() {
String text = "Manual refactoring is not recommended";

String patterThatExists = "ring is not";
Integer expectedIndex = 14;
assertEquals(expectedIndex, Horspool.findFirstExactOccurrence(patterThatExists, text));

String patternThatDoesNotExist = "man";
assertNull(Horspool.findFirstExactOccurrence(patternThatDoesNotExist, text));
}

/**
* Test number of character comparisons in searching for: AB, ABB, ABBB, ABBBB in 1000 As
*/
@Test
void testComparisons() {
final int N = 1000;

StringBuilder builder = new StringBuilder(N);
for (int i=0; i<N; i++) {
builder.append('A');
}

String text = builder.toString();
String[] patterns = { "AB", "ABB", "ABBB", "ABBBB" };

for (String pattern : patterns) {
assertNull(Horspool.findFirstOccurrence(pattern, text));
Long expected = Math.round(Math.ceil(
(double) (text.length() - pattern.length() + 1) / (pattern.length() - 1))); // 999, 499, 333, 249
assertEquals(expected, (long) Horspool.getLastComparisons());
}
}

}
Morty Proxy This is a proxified and sanitized view of the page, visit original site.