|
|||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
Object
  SpellWriter
public class SpellWriter
Writes spelling dictionaries, which can later be used by SpellReader
to obtain spelling suggestions. Provides efficient, high-volume updates
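to a spelling correction dictionary. Typical steps for creating a dictionary:
1. Call open(File) to create a SpellWriter for the dictionary directory.
2. Call queueWord(String) for each word, and queueBreak() at each break in the text.
3. Call flushQueuedWords() to write the queued words to the dictionary on disk.
4. Call close() to close all files.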
Inspired by and very distantly based on Nicolas Maisonneuve / David Spencer code.
Field Summary | |
---|---|
private static int |
DEFAULT_MIN_PAIR_FREQ
Default minimum pair frequency = 2 |
private static int |
DEFAULT_MIN_WORD_FREQ
Default minimum word frequency = 2 |
private static DoubleMetaphone |
doubleMetaphone
Used for calculating double metaphone keys |
private StringBuffer |
edmapBuf
String buffer for edmap pairs |
private File |
edmapFile
File containing edit map data |
private File |
freqFile
File containing compiled word frequencies |
private char[] |
keyChars
Character array for forming combo keys |
private static int |
MAX_RECENT_PAIRS
Max # of pairs to hash before flushing |
private static int |
MAX_RECENT_WORDS
How large to make the cache of recently added words |
private int |
minPairFreq
Minimum frequency for pairs to retain |
private int |
minWordFreq
Minimum frequency for words to retain |
private File |
pairFreqFile
File containing compiled pair frequency data |
private File |
pairQueueFile
File to queue pairs into |
private PrintWriter |
pairQueueWriter
For writing to the pair queue |
private String |
prevWord
The previous word queued, or null if none (or a break was queued) |
private HashMap<String,Integer> |
recentPairs
For counting pair frequencies prior to write |
private HashMap<String,Integer> |
recentWords
For counting word frequencies prior to write |
private File |
sampleFile
File containing frequency sample data |
private int |
SORT_MEM_LIMIT
Memory limit for sorting |
private File |
spellIndexDir
Directory to store the spelling dictionary in |
(package private) Pattern |
splitPat
Used for splitting lines delimited with bar |
private Set |
stopSet
Set of stop words in use; default is null for no stop set |
private File |
wordQueueFile
File to queue words into |
private PrintWriter |
wordQueueWriter
For writing to the word queue |
Constructor Summary | |
---|---|
private |
SpellWriter()
Private constructor -- do not construct directly; rather, use the static open(File) method. |
Method Summary | |
---|---|
private void |
addCombo(String word,
FileSorter edmapSorter,
int p0,
int p1,
int p2,
int p3)
Add a combination of letters to the edit map |
private void |
addCombos(String word,
FileSorter edMapSorter)
Add combinations of the first six letters of the word, capturing all the possibilities that represent an edit distance of 2 or less. |
boolean |
anyWordsQueued()
Check if any words are queued for add. |
static String |
calcMetaphone(String word)
|
void |
clearDictionary()
Delete all words in the dictionary (including those queued on disk) |
void |
close()
Closes all files. |
private void |
closeQueueWriters()
Closes the queue writers if either are open |
private char |
comboChar(char c)
|
private char[] |
comboKey(String word,
int p0,
int p1,
int p2,
int p3)
Calculate a key from the given characters of the word. |
private void |
condenseEdmapKey(String key,
ArrayList<String> words,
Writer out)
Perform prefix compression on a list of words for a single edit map key. |
private void |
deleteFile(File file)
Attempt to delete (and at least truncate) the given file. |
protected void |
finalize()
|
private void |
flushPhase1(ProgressTracker prog)
Performs the word-adding phase of the flush procedure. |
private void |
flushPhase2(ProgressTracker prog)
Performs the pair-adding phase of the flush procedure. |
void |
flushQueuedWords()
Ensures that all words in the queue are written to the dictionary on disk. |
void |
flushQueuedWords(ProgressTracker prog)
Ensures that all words in the queue are written to the dictionary on disk. |
private void |
flushRecentPairs()
Flush any accumulated pairs, with their counts. |
private void |
flushRecentWords()
Flush any accumulated words, with their counts. |
static SpellWriter |
open(File spellIndexDir)
Creates a SpellWriter, and establishes the directory to store the dictionary in. |
private void |
openInternal(File spellIndexDir)
Establishes the directory to store the dictionary in. |
private void |
openPairQueueWriter()
Opens the pair queue writer. |
private void |
openWordQueueWriter()
Opens the word queue writer. |
void |
queueBreak()
Called to signal a break in the text, to inform the spell checker to avoid pairing the previous word with the next one. |
void |
queueWord(String word)
Queue the given word. |
private void |
readFreqs(File inFile,
FileSorter out,
ProgressTracker prog)
Read an existing frequency file, and add it to a file sorter. |
private void |
replaceFile(File oldFile,
File newFile)
Replace an old file with a new one |
void |
setMinPairFreq(int freq)
Establish a minimum pair frequency. |
void |
setMinWordFreq(int freq)
Establish a minimum word frequency. |
void |
setStopwords(Set set)
Establishes the set of stop words to use. |
private void |
writeEdMap(FileSorter edmapSorter,
File outFile,
ProgressTracker prog)
Write out a prefix-compressed edit-distance map, which also contains term frequencies. |
private void |
writeFreqs(File outFile,
FileSorter freqSorter,
IntList allFreqs,
FileSorter edmapSorter,
ProgressTracker prog)
Write out frequency data, in sorted order. |
private void |
writeFreqSamples(IntList allFreqs,
File file,
ProgressTracker prog)
Write term frequency samples to the given file. |
Methods inherited from class Object |
---|
clone, equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
private File spellIndexDir
private Set stopSet
private File wordQueueFile
private String prevWord
private File pairQueueFile
private File freqFile
private File sampleFile
private File edmapFile
private File pairFreqFile
private PrintWriter wordQueueWriter
private PrintWriter pairQueueWriter
private static final int MAX_RECENT_WORDS
private HashMap<String,Integer> recentWords
private static final int MAX_RECENT_PAIRS
private HashMap<String,Integer> recentPairs
private static final int DEFAULT_MIN_WORD_FREQ
private int minWordFreq
private static final int DEFAULT_MIN_PAIR_FREQ
private int minPairFreq
private static DoubleMetaphone doubleMetaphone
Pattern splitPat
private int SORT_MEM_LIMIT
private char[] keyChars
private StringBuffer edmapBuf
Constructor Detail |
---|
private SpellWriter()
Private constructor -- do not construct directly; rather, use the static open(File)
method.
Method Detail |
---|
public static SpellWriter open(File spellIndexDir) throws IOException
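Creates a SpellWriter, and establishes the directory to store the dictionary in.
By default no stop set is used; to establish one, call setStopwords(Set)
after opening a writer.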
The minimum word frequency defaults to 2; if you want to
override that, call setMinWordFreq(int)
.
A similar threshold exists for pairs; the minimum pair frequency defaults
to 2; if you want to override that, call setMinPairFreq(int)
.
spellIndexDir
- Directory in which to store the spelling dictionary
IOException
private void openInternal(File spellIndexDir) throws IOException
IOException
public void setStopwords(Set set)
set
- the set of stop words to use
public void setMinWordFreq(int freq)
freq
- the new minimum word frequency
public void setMinPairFreq(int freq)
freq
- the new minimum pair frequency
public void close() throws IOException
IOException
public void clearDictionary() throws IOException
IOException
public void queueWord(String word) throws IOException
IOException
public void queueBreak()
private void flushRecentPairs() throws IOException
IOException
private void flushRecentWords() throws IOException
IOException
public boolean anyWordsQueued() throws IOException
IOException
public void flushQueuedWords() throws IOException
Ensures that all words in the queue are written to the dictionary on disk.
For details, see flushQueuedWords(ProgressTracker)
below.
IOException
public void flushQueuedWords(ProgressTracker prog) throws IOException
prog
- A tracker that will be called periodically during the
process; generally you'll want to supply one that
prints out progress messages.
If null, no progress will be reported.
IOException
private void flushPhase1(ProgressTracker prog) throws IOException
IOException
- if something goes wrong
private void readFreqs(File inFile, FileSorter out, ProgressTracker prog) throws IOException
IOException
private void writeFreqs(File outFile, FileSorter freqSorter, IntList allFreqs, FileSorter edmapSorter, ProgressTracker prog) throws IOException
IOException
private void addCombos(String word, FileSorter edMapSorter) throws IOException
IOException
private void addCombo(String word, FileSorter edmapSorter, int p0, int p1, int p2, int p3) throws IOException
IOException
private char[] comboKey(String word, int p0, int p1, int p2, int p3)
private char comboChar(char c)
private void writeFreqSamples(IntList allFreqs, File file, ProgressTracker prog) throws IOException
IOException
private void writeEdMap(FileSorter edmapSorter, File outFile, ProgressTracker prog) throws IOException
IOException
private void condenseEdmapKey(String key, ArrayList<String> words, Writer out) throws IOException
IOException
private void deleteFile(File file) throws IOException
IOException
private void replaceFile(File oldFile, File newFile)
private void flushPhase2(ProgressTracker prog) throws IOException
IOException
private void openWordQueueWriter() throws IOException
IOException
private void openPairQueueWriter() throws IOException
IOException
private void closeQueueWriters() throws IOException
IOException
public static String calcMetaphone(String word)
protected void finalize() throws Throwable
Overrides: finalize in class Object
Throwable
|
|||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |