@PublicationReference(author="Peter Norvig", title="How to Write a Spelling Corrector", year=2009, type=WebPage, url="http://norvig.com/spell-correct.html") public class SimpleStatisticalSpellingCorrector extends AbstractCloneableSerializable implements Evaluator<java.lang.String,java.lang.String>
| Modifier and Type | Class and Description |
|---|---|
static class |
SimpleStatisticalSpellingCorrector.Learner
A learner for the
SimpleStatisticalSpellingCorrector. |
| Modifier and Type | Field and Description |
|---|---|
protected char[] |
alphabet
The alphabet of lower-case characters.
|
protected DefaultDataDistribution<java.lang.String> |
wordCounts
Maps known words to the number of times they've been seen.
|
| Constructor and Description |
|---|
SimpleStatisticalSpellingCorrector()
Creates a new, default
SimpleStatisticalSpellingCorrector with
a default alphabet. |
SimpleStatisticalSpellingCorrector(char[] alphabet)
Creates a new
SimpleStatisticalSpellingCorrector with a given
alphabet. |
SimpleStatisticalSpellingCorrector(DefaultDataDistribution<java.lang.String> wordCounts,
char[] alphabet)
Creates a new
SimpleStatisticalSpellingCorrector. |
| Modifier and Type | Method and Description |
|---|---|
void |
add(java.lang.String word)
Adds a word to the dictionary of counts for the spelling corrector.
|
void |
add(java.lang.String word,
int count)
Adds a given number of counts for a word to the dictionary of counts for
the spelling corrector.
|
static char[] |
createDefaultAlphabet()
Creates the default alphabet, which consists of the lower-case English letters.
|
java.lang.String |
evaluate(java.lang.String word)
Evaluates the function on the given input and returns the output.
|
java.lang.String |
findBest(java.lang.Iterable<java.lang.String> words,
java.lang.String defaultBestWord)
Finds the best word from a given list of words by finding the one with
the highest count in the dictionary.
|
char[] |
getAlphabet()
Gets the alphabet of lower-case characters that can be used for replaces
and inserts.
|
DefaultDataDistribution<java.lang.String> |
getWordCounts()
Gets the dictionary of word counts.
|
protected java.util.Set<java.lang.String> |
knownTwoCharacterEdits(java.lang.Iterable<java.lang.String> oneCharacterEdits)
Creates the set of known two character edits for a given list of one
character edits.
|
protected void |
possibleOneCharacterEdits(java.lang.String word,
java.util.Collection<java.lang.String> result)
Lists all possible one-character edits for a given word by looking at
character deletes, transposes, replaces, and inserts.
|
void |
setAlphabet(char[] alphabet)
Sets the alphabet of lower-case characters that can be used for replaces
and inserts.
|
void |
setWordCounts(DefaultDataDistribution<java.lang.String> wordCounts)
Sets the dictionary of word counts.
|
Methods inherited from class AbstractCloneableSerializable: clone
protected DefaultDataDistribution<java.lang.String> wordCounts
protected char[] alphabet
public SimpleStatisticalSpellingCorrector()
Creates a new, default SimpleStatisticalSpellingCorrector with
a default alphabet.
public SimpleStatisticalSpellingCorrector(char[] alphabet)
Creates a new SimpleStatisticalSpellingCorrector with a given
alphabet.
alphabet - The alphabet to use.
public SimpleStatisticalSpellingCorrector(DefaultDataDistribution<java.lang.String> wordCounts, char[] alphabet)
Creates a new SimpleStatisticalSpellingCorrector.
wordCounts - The initial word counts.
alphabet - The alphabet to use.
public static char[] createDefaultAlphabet()
public void add(java.lang.String word)
word - The word to add an occurrence of.
public void add(java.lang.String word,
int count)
word - The word to add.
count - The count of occurrences.
public java.lang.String evaluate(java.lang.String word)
Specified by: evaluate in interface Evaluator<java.lang.String,java.lang.String>
public java.lang.String findBest(java.lang.Iterable<java.lang.String> words,
java.lang.String defaultBestWord)
words - The list of words.
defaultBestWord - The default word to return if none are in the dictionary.
protected void possibleOneCharacterEdits(java.lang.String word,
java.util.Collection<java.lang.String> result)
word - The word to get the edits for.
result - The collection to write the edits into.
protected java.util.Set<java.lang.String> knownTwoCharacterEdits(java.lang.Iterable<java.lang.String> oneCharacterEdits)
oneCharacterEdits - The list of one character edits.
public DefaultDataDistribution<java.lang.String> getWordCounts()
public void setWordCounts(DefaultDataDistribution<java.lang.String> wordCounts)
wordCounts - The dictionary of word counts.
public char[] getAlphabet()
public void setAlphabet(char[] alphabet)
alphabet - The alphabet of lower-case characters.