Linear-Time Substring Search Using the KMP Algorithm in Java
The Knuth-Morris-Pratt algorithm locates occurrences of a pattern with in a text in O(n) time by preprocessing the pattern to determine valid shift distances. Unlike naive approaches that restart comparisons from the beginning after mismatches, KMP utilizes the structure of the pattern itself to skip redundant checks.
Prefix Function Construction
The algorithm relies on a prefix function (commonly called the LPS array) that stores, for each position in the pattern, the length of the longest proper prefix wich is also a suffix for the substring ending at that position.
Consider processing the pattern P = "ABABC":
- Position 0 ('A'): No proper prefix/suffix, value 0
- Position 1 ('B'): No match, value 0
- Position 2 ('A'): Matches prefix[0], value 1
- Position 3 ('B'): Extends previous match to "AB", value 2
- Position 4 ('C'): Mismatch, fallback to previous border, no match found, value 0
Resulting LPS array: [0, 0, 1, 2, 0]
Matching Process
During the search phase, maintain two indices: txtIdx traversing the text and patIdx tracking the pattern position. When characters match, advance both. Upon mismatch with patIdx > 0, reset patIdx using lps[patIdx - 1] instead of reverting txtIdx. If patIdx reaches zero, simply advance txtIdx.
Implementation: Basic Pattern Search
import java.util.Scanner;
public class PatternMatcher {
public static void main(String[] args) {
Scanner input = new Scanner(System.in);
String text = input.next();
String pattern = input.next();
int[] borders = computeBorders(pattern);
executeSearch(text, pattern, borders);
for (int val : borders) {
System.out.print(val + " ");
}
}
static void executeSearch(String text, String pattern, int[] borders) {
int textLen = text.length();
int patLen = pattern.length();
int t = 0, p = 0;
while (t < textLen) {
if (pattern.charAt(p) == text.charAt(t)) {
p++;
t++;
}
if (p == patLen) {
System.out.println(t - p + 1);
p = borders[p - 1];
} else if (t < textLen && pattern.charAt(p) != text.charAt(t)) {
if (p != 0) {
p = borders[p - 1];
} else {
t++;
}
}
}
}
static int[] computeBorders(String pattern) {
int length = pattern.length();
int[] borders = new int[length];
int candidate = 0;
int pos = 1;
while (pos < length) {
if (pattern.charAt(pos) == pattern.charAt(candidate)) {
candidate++;
borders[pos] = candidate;
pos++;
} else {
if (candidate != 0) {
candidate = borders[candidate - 1];
} else {
borders[pos] = 0;
pos++;
}
}
}
return borders;
}
}
Implementation: Repeated Patern Construction
This variant determines how many times a string must be appended to itself to contain a target repetition count:
import java.util.Scanner;
public class StringRepetition {
public static void main(String[] args) {
Scanner sc = new Scanner(System.in);
int strLen = sc.nextInt();
int repeatCount = sc.nextInt();
sc.nextLine();
String raw = " " + sc.nextLine();
char[] chars = raw.toCharArray();
int[] prefix = new int[55];
int i = 2, j = 0;
while (i <= strLen) {
while (j > 0 && chars[i] != chars[j + 1]) {
j = prefix[j];
}
if (chars[i] == chars[j + 1]) {
prefix[i] = ++j;
i++;
} else {
prefix[i] = j;
i++;
}
}
for (int k = 1; k <= strLen; k++) {
System.out.print(chars[k]);
}
for (int k = 2; k <= repeatCount; k++) {
for (int m = prefix[strLen] + 1; m <= strLen; m++) {
System.out.print(chars[m]);
}
}
}
}
Implementation: Occurrence Counting
Counting pattern occurrences within a text:
import java.util.*;
public class OccurrenceCounter {
public int countMatches(String needle, String haystack) {
int nLen = needle.length();
int hLen = haystack.length();
if (nLen > hLen || hLen == 0) return 0;
int matches = 0;
int[] fallback = buildFallback(needle);
for (int h = 0, n = 0; h < hLen; h++) {
while (n > 0 && haystack.charAt(h) != needle.charAt(n)) {
n = fallback[n - 1];
}
if (haystack.charAt(h) == needle.charAt(n)) {
n++;
}
if (n == nLen) {
matches++;
n = fallback[n - 1];
}
}
return matches;
}
private int[] buildFallback(String pattern) {
int pLen = pattern.length();
int[] fallback = new int[pLen];
for (int idx = 1, matchLen = 0; idx < pLen; idx++) {
while (matchLen > 0 && pattern.charAt(idx) != pattern.charAt(matchLen)) {
matchLen = fallback[matchLen - 1];
}
if (pattern.charAt(idx) == pattern.charAt(matchLen)) {
matchLen++;
}
fallback[idx] = matchLen;
}
return fallback;
}
}
Implementation: Longest Prefix Match
Finding the longest prefix of pattern matching a substring of text:
import java.util.Scanner;
public class PrefixMatcher {
static int maxMatch = 0;
static int[] buildPartialTable(char[] pattern) {
int m = pattern.length;
int[] table = new int[m];
table[0] = 0;
int k = 0;
for (int q = 1; q < m; q++) {
while (k > 0 && pattern[q] != pattern[k]) {
k = table[k - 1];
}
if (pattern[q] == pattern[k]) {
k++;
}
table[q] = k;
}
return table;
}
static int findLongestPrefix(char[] text, char[] pattern) {
int[] table = buildPartialTable(pattern);
int q = 0;
for (int i = 0; i < text.length; i++) {
while (q > 0 && text[i] != pattern[q]) {
q = table[q - 1];
}
if (text[i] == pattern[q]) {
q++;
maxMatch = Math.max(maxMatch, q);
}
if (q == pattern.length) {
q = table[q - 1];
}
}
return maxMatch;
}
public static void main(String[] args) {
Scanner scanner = new Scanner(System.in);
char[] text = scanner.nextLine().toCharArray();
char[] pattern = scanner.nextLine().toCharArray();
System.out.println(findLongestPrefix(text, pattern));
}
}
Implementation: String Periodicity
Detecting the smallest period of a string using the failure function:
import java.util.Scanner;
public class PeriodicityDetector {
private static int[] computePrefix(String s) {
int n = s.length();
int[] pi = new int[n];
for (int i = 1, j = 0; i < n; i++) {
while (j > 0 && s.charAt(i) != s.charAt(j)) {
j = pi[j - 1];
}
if (s.charAt(i) == s.charAt(j)) {
j++;
}
pi[i] = j;
}
return pi;
}
public static void main(String[] args) {
Scanner sc = new Scanner(System.in);
String str = sc.next();
int[] pi = computePrefix(str);
int n = str.length();
int period = n - pi[n - 1];
if (n % period == 0) {
System.out.println(n / period);
} else {
System.out.println(1);
}
}
}