# ======================================================================== # JANUS-SR Janus Speech Recognition Toolkit # ------------------------------------------------------------ # Author : Martin Westphal # Module : bigram.tcl # Date : 20.Oct.97 # # Remarks : Read in a text file and count words and word pairs. # Omit the first word in each line. Print a list of # word pairs and their bigram probability. The output # format for a word pair w1 w2 is: # # w2 w1 p(w2|w1) # #======================================================================== if { $argc < 1 || [lindex $argv 1] == "-help"} { puts stderr "USAGE: $argv0 'textfile' [min_prob]" exit } set filename [lindex $argv 0] if { $argc == 2} { set minProb [lindex $argv 1] } else { set minProb -1.0 } # --------------------- # define count procs # --------------------- proc count {array index} { upvar $array count if [info exists count($index)] { incr count($index) } else { set count($index) 1 } } # --------------------- # read text file # --------------------- set FP [open $filename r] while {[gets $FP line] >= 0} { set words [lrange $line 1 end] set prev count count1 "$prev" ;# count the begin marker in array count1 foreach word $words { count count1 "$word" ;# count the words in array count1 count count2 "$word,$prev" ;# count the word pairs in array count2 set prev $word } count count2 ",$prev" } close $FP # --------------------- # print sorted list # --------------------- foreach pair [lsort [array names count2]] { set words [split $pair ,] set word2 [lindex $words 0] set word1 [lindex $words 1] set prob [expr 1.0 * $count2($pair) / $count1($word1)] if {$prob >= $minProb} { puts "$word2 $word1 $prob" } } exit