Browse Source

cleaned code up a bit

master
Wesley Kerfoot 11 years ago
parent
commit
b1657484fa
  1. 26
      tokenize.hs

26
tokenize.hs

@ -4,10 +4,11 @@ import qualified Data.Map as Dm
import Control.Applicative import Control.Applicative
import Control.Monad import Control.Monad
ngrams' n len xs = let next = (take n xs) ngrams' n len xs =
in case len == n of let calcNext next
False -> (toLower <$> next) : (ngrams' n (len - 1) $ drop 1 xs) | len == n = return xs
_ -> return xs | otherwise = (toLower <$> next) : (ngrams' n (len - 1) $ drop 1 xs)
ngrams n xs = ngrams' n (length xs) xs ngrams n xs = ngrams' n (length xs) xs
digrams = ngrams 2 digrams = ngrams 2
@ -27,8 +28,9 @@ startsP' letter dgs = foldr check (0, 0) dgs where
_ -> (a, n + 1) _ -> (a, n + 1)
check (first:[]) (a, n) = (a, n + 1) check (first:[]) (a, n) = (a, n + 1)
startsP letter dgs = let (n, k) = startsP' letter (digrams dgs) startsP letter dgs =
in (fromIntegral n) / (fromIntegral k) let (n, k) = startsP' letter (digrams dgs)
in (fromIntegral n) / (fromIntegral k)
select [] = [] select [] = []
@ -58,10 +60,12 @@ out fname n (d, k) = appendFile fname $ (show d) ++ ":" ++ (show $ k/n) ++ ","
-- first argument is all possible ngrams in a Map -- first argument is all possible ngrams in a Map
-- second argument is all of the tokenized ngrams from the corpus -- second argument is all of the tokenized ngrams from the corpus
ngramProbs k ngrams [] = (k, ngrams) ngramProbs k ngrams [] = (k, ngrams)
ngramProbs k ngrams (n:ns) = case (Dm.lookup n ngrams) of ngramProbs k ngrams (n:ns) =
Nothing -> ngramProbs k ngrams ns case (Dm.lookup n ngrams) of
(Just count) -> let ngrams' = Dm.insert n (count+1) ngrams Nothing -> ngramProbs k ngrams ns
in ngramProbs (k+1) ngrams' ns (Just count) ->
let ngrams' = Dm.insert n (count+1) ngrams
in ngramProbs (k+1) ngrams' ns
-- buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a -- buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a
buildProbabilities ngrams = Dm.fromList [(ngram, 0) | ngram <- ngrams] buildProbabilities ngrams = Dm.fromList [(ngram, 0) | ngram <- ngrams]
@ -71,4 +75,4 @@ main = do
let (n, ngramMap) = ngramProbs 0 (buildProbabilities englishQuintgrams) (quintgrams corpus) let (n, ngramMap) = ngramProbs 0 (buildProbabilities englishQuintgrams) (quintgrams corpus)
appendFile "./quadgrams.json" "{" appendFile "./quadgrams.json" "{"
mapM_ (out "./quadgrams.json" n) $ [(d,k) | (d,k) <- Dm.toList ngramMap, k /= 0] mapM_ (out "./quadgrams.json" n) $ [(d,k) | (d,k) <- Dm.toList ngramMap, k /= 0]
appendFile "./quadgrams.json" "}" appendFile "./quadgrams.json" "}"