|
@ -12,6 +12,8 @@ ngrams n xs = ngrams' n (length xs) xs |
|
|
|
|
|
|
|
|
digrams = ngrams 2 |
|
|
digrams = ngrams 2 |
|
|
trigrams = ngrams 3 |
|
|
trigrams = ngrams 3 |
|
|
|
|
|
quadgrams = ngrams 4 |
|
|
|
|
|
quintgrams = ngrams 5 |
|
|
-- start out by choosing a random unigram |
|
|
-- start out by choosing a random unigram |
|
|
|
|
|
|
|
|
-- then choose another letter with the conditional probability that it follows the first one |
|
|
-- then choose another letter with the conditional probability that it follows the first one |
|
@ -34,31 +36,39 @@ select (x : xs) = (x, xs) : map (fmap (x :)) (select xs) |
|
|
|
|
|
|
|
|
perm2 ks = [[x,y] | (x, ys) <- select ks, y <- ys] |
|
|
perm2 ks = [[x,y] | (x, ys) <- select ks, y <- ys] |
|
|
|
|
|
|
|
|
|
|
|
permute 1 xs = map (:[]) xs |
|
|
permute 2 xs = perm2 xs |
|
|
permute 2 xs = perm2 xs |
|
|
permute n xs = join [[x : p | p <- (permute (n-1) ys)] | (x,ys) <- select xs] |
|
|
permute n xs = join [[x : p | p <- (permute (n-1) ys)] | (x,ys) <- select xs] |
|
|
|
|
|
|
|
|
alphabet = ((map return ".,\" :;?![]()") ++ (map return ['a'..'z'])) ++ (map (\x -> [x,x]) ['a'..'z']) |
|
|
alphabet = ".,\" :;?![]()" ++ ['a'..'z'] |
|
|
|
|
|
|
|
|
englishDigrams = permute 2 alphabet |
|
|
repeatLetters n = map $ take n . repeat |
|
|
|
|
|
|
|
|
englishTrigrams = permute 3 alphabet |
|
|
englishDigrams = (repeatLetters 2 alphabet) ++ permute 2 alphabet |
|
|
|
|
|
|
|
|
out fname n (d, k) = appendFile fname $ (show d) ++ ":" ++ (show $ k/n) ++ "," |
|
|
englishTrigrams = (repeatLetters 3 alphabet) ++ permute 3 alphabet |
|
|
|
|
|
|
|
|
digramProbs :: (Num a, Num t, Ord k) => t -> Dm.Map k a -> [k] -> (t, Dm.Map k a) |
|
|
englishQuadgrams = (repeatLetters 4 alphabet) ++ permute 4 alphabet |
|
|
-- first argument is tokenized digrams |
|
|
|
|
|
-- seconds argument is all possible digrams |
|
|
|
|
|
digramProbs n ds [] = (n, ds) |
|
|
|
|
|
digramProbs n digrams (d:ds) = case (Dm.lookup d digrams) of |
|
|
|
|
|
Nothing -> digramProbs n digrams ds |
|
|
|
|
|
(Just count) -> let digrams' = Dm.insert d (count+1) digrams |
|
|
|
|
|
in digramProbs (n+1) digrams' ds |
|
|
|
|
|
|
|
|
|
|
|
buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a |
|
|
englishQuintgrams = (repeatLetters 5 alphabet) ++ permute 5 alphabet |
|
|
buildProbabilities digrams = Dm.fromList [(digram, 0.0) | digram <- digrams] |
|
|
|
|
|
|
|
|
|
|
|
-- main = do |
|
|
out fname n (d, k) = appendFile fname $ (show d) ++ ":" ++ (show $ k/n) ++ "," |
|
|
-- corpus <- readFile "./xab" |
|
|
|
|
|
-- let (n, digramMap) = digramProbs 0 (buildProbabilities englishTrigrams) (trigrams corpus) |
|
|
|
|
|
-- mapM_ (out "./test2.json" n) $ Dm.toList digramMap |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-- ngramProbs :: (Num a, Num t, Ord k) => t -> Dm.Map k a -> [k] -> (t, Dm.Map k a) |
|
|
|
|
|
-- first argument is all possible ngrams in a Map |
|
|
|
|
|
-- second argument is all of the tokenized ngrams from the corpus |
|
|
|
|
|
ngramProbs k ngrams [] = (k, ngrams) |
|
|
|
|
|
ngramProbs k ngrams (n:ns) = case (Dm.lookup n ngrams) of |
|
|
|
|
|
Nothing -> ngramProbs k ngrams ns |
|
|
|
|
|
(Just count) -> let ngrams' = Dm.insert n (count+1) ngrams |
|
|
|
|
|
in ngramProbs (k+1) ngrams' ns |
|
|
|
|
|
|
|
|
|
|
|
-- buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a |
|
|
|
|
|
buildProbabilities ngrams = Dm.fromList [(ngram, 0) | ngram <- ngrams] |
|
|
|
|
|
|
|
|
|
|
|
main = do |
|
|
|
|
|
corpus <- readFile "./xab" |
|
|
|
|
|
let (n, ngramMap) = ngramProbs 0 (buildProbabilities englishQuintgrams) (quintgrams corpus) |
|
|
|
|
|
appendFile "./quadgrams.json" "{" |
|
|
|
|
|
mapM_ (out "./quadgrams.json" n) $ [(d,k) | (d,k) <- Dm.toList ngramMap, k /= 0] |
|
|
|
|
|
appendFile "./quadgrams.json" "}" |