I am trying to create a word counter that returns an array of the number of times each word in a text file appears. Moreover, I have been assigned to parallelize this program.
My initial attempt at this task was as follows
Implementation 1
func WordCount(words []string, startWord int, endWord int, waitGroup *sync.WaitGroup, freqsChannel chan<- map[string]int) {
freqs := make(map[string]int)
for i := startWord; i < endWord; i++ {
word := words[i]
freqs[word]++
}
freqsChannel <- freqs
waitGroup.Done()
}
func ParallelWordCount(text string) map[string]int {
// Split text into string array of the words in text.
text = strings.ToLower(text)
text = strings.ReplaceAll(text, ",", "")
text = strings.ReplaceAll(text, ".", "")
words := strings.Fields(text)
length := len(words)
threads := 28
freqsChannel := make(chan map[string]int, threads)
var waitGroup sync.WaitGroup
waitGroup.Add(threads)
defer waitGroup.Wait()
wordsPerThread := length / threads // always rounds down
wordsInLastThread := length - (threads-1)*wordsPerThread
startWord := -wordsPerThread
endWord := 0
for i := 1; i <= threads; i++ {
if i < threads {
startWord += wordsPerThread
endWord += wordsPerThread
} else {
startWord += wordsInLastThread
endWord += wordsInLastThread
}
go WordCount(words, startWord, endWord, &waitGroup, freqsChannel)
}
freqs := <-freqsChannel
for i := 1; i < threads; i++ {
subFreqs := <-freqsChannel
for word, count := range subFreqs {
freqs[word] += count
}
}
return freqs
}
According to my teaching assistant, this was not a good solution as the pre-processing of the text file carried out by
text = strings.ToLower(text)
text = strings.ReplaceAll(text, ",", "")
text = strings.ReplaceAll(text, ".", "")
words := strings.Fields(text)
in ParallelWordCount
goes against the idea of parallel processing.
Now, to fix this, I have moved the responsibility of processing the text file into an array of words into the the WordCount
function that is called on separate goroutines for different parts of the text file. Below is the code for my second implementation.
Implementation 2
func WordCount(text string, waitGroup *sync.WaitGroup, freqsChannel chan<- map[string]int) {
freqs := make(map[string]int)
text = strings.ToLower(text)
text = strings.ReplaceAll(text, ",", "")
text = strings.ReplaceAll(text, ".", "")
words := strings.Fields(text)
for _, value := range words {
freqs[value]++
}
freqsChannel <- freqs
waitGroup.Done()
}
func splitCount(str string, subStrings int, waitGroup *sync.WaitGroup, freqsChannel chan<- map[string]int) {
if subStrings != 1 {
length := len(str)
charsPerSubstring := length / subStrings
i := 0
for str[charsPerSubstring+i] != ' ' {
i++
}
subString := str[0 : charsPerSubstring+i+1]
go WordCount(subString, waitGroup, freqsChannel)
splitCount(str[charsPerSubstring+i+1:length], subStrings-1, waitGroup, freqsChannel)
} else {
go WordCount(str, waitGroup, freqsChannel)
}
}
func ParallelWordCount(text string) map[string]int {
threads := 28
freqsChannel := make(chan map[string]int, threads)
var waitGroup sync.WaitGroup
waitGroup.Add(threads)
defer waitGroup.Wait()
splitCount(text, threads, &waitGroup, freqsChannel)
// Collect and return frequences
freqs := <-freqsChannel
for i := 1; i < threads; i++ {
subFreqs := <-freqsChannel
for word, count := range subFreqs {
freqs[word] += count
}
}
return freqs
}
The average runtime of this implementation is 3 ms compared to the old average of 5 ms, but have I thoroughly addressed the issue raised by my teaching assistant or does the second implementation also not take full advantage of parallel processing to efficiently count the words of text file?