From 967c5ea725819e1c1d64825cbaa17470d45efdf8 Mon Sep 17 00:00:00 2001 From: Wesley Kerfoot Date: Sun, 3 Feb 2013 21:53:22 -0500 Subject: [PATCH] using bytestrings --- approximate.hs | 11 +++++++---- tokenize.hs | 50 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/approximate.hs b/approximate.hs index 2780928..2662665 100644 --- a/approximate.hs +++ b/approximate.hs @@ -1,7 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} import Control.Monad -import Control.Applicative import qualified Control.Monad.Random as MR import Data.Aeson import Data.Aeson.Types @@ -10,6 +9,7 @@ import qualified Data.ByteString.Lazy as DB import Data.Char frequencyMap [] ngmap = ngmap +frequencyMap ((_, 0.0):ngs) ngmap = frequencyMap ngs ngmap frequencyMap ((ng, p):ngs) ngmap = case Dm.lookup (DB.head ng) ngmap of Nothing -> frequencyMap ngs (Dm.insert (DB.head ng) [(ng, p)] ngmap) Just xs -> frequencyMap ngs (Dm.insert (DB.head ng) ((ng, p) : xs) ngmap) @@ -30,6 +30,9 @@ generateText k n acc cur freqmap = do gibberish n = do js <- (DB.readFile "./quadgrams.json") let (Just ngramPairs) = decode js :: Maybe (Dm.Map DB.ByteString Rational) - result <- generateText 4 n "" 32 (frequencyMap (Dm.toList ngramPairs) (Dm.singleton 32 [])) - print result -main = gibberish 500 \ No newline at end of file + result <- generateText 4 n "" 116 (frequencyMap (Dm.toList ngramPairs) (Dm.singleton 0 [])) + return result + +main = do + output <- gibberish 1550 + DB.writeFile "./output3.txt" output \ No newline at end of file diff --git a/tokenize.hs b/tokenize.hs index cb1ed9c..975d699 100644 --- a/tokenize.hs +++ b/tokenize.hs @@ -12,6 +12,8 @@ ngrams n xs = ngrams' n (length xs) xs digrams = ngrams 2 trigrams = ngrams 3 +quadgrams = ngrams 4 +quintgrams = ngrams 5 -- start out by choosing a random unigram -- then choose another letter with the conditional probability that it follows the first one @@ -34,31 +36,39 @@ select (x : xs) = (x, xs) : map (fmap (x :)) (select xs) perm2 ks = [[x,y] | (x, ys) <- select ks, y <- ys] +permute 1 xs = map (:[]) xs permute 2 xs = perm2 xs permute n xs = join [[x : p | p <- (permute (n-1) ys)] | (x,ys) <- select xs] -alphabet = ((map return ".,\" :;?![]()") ++ (map return ['a'..'z'])) ++ (map (\x -> [x,x]) ['a'..'z']) +alphabet = ".,\" :;?![]()" ++ ['a'..'z'] -englishDigrams = permute 2 alphabet +repeatLetters n = map $ take n . repeat -englishTrigrams = permute 3 alphabet +englishDigrams = (repeatLetters 2 alphabet) ++ permute 2 alphabet + +englishTrigrams = (repeatLetters 3 alphabet) ++ permute 3 alphabet + +englishQuadgrams = (repeatLetters 4 alphabet) ++ permute 4 alphabet + +englishQuintgrams = (repeatLetters 5 alphabet) ++ permute 5 alphabet out fname n (d, k) = appendFile fname $ (show d) ++ ":" ++ (show $ k/n) ++ "," -digramProbs :: (Num a, Num t, Ord k) => t -> Dm.Map k a -> [k] -> (t, Dm.Map k a) --- first argument is tokenized digrams --- seconds argument is all possible digrams -digramProbs n ds [] = (n, ds) -digramProbs n digrams (d:ds) = case (Dm.lookup d digrams) of - Nothing -> digramProbs n digrams ds - (Just count) -> let digrams' = Dm.insert d (count+1) digrams - in digramProbs (n+1) digrams' ds - -buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a -buildProbabilities digrams = Dm.fromList [(digram, 0.0) | digram <- digrams] - --- main = do - -- corpus <- readFile "./xab" - -- let (n, digramMap) = digramProbs 0 (buildProbabilities englishTrigrams) (trigrams corpus) - -- mapM_ (out "./test2.json" n) $ Dm.toList digramMap - \ No newline at end of file +-- ngramProbs :: (Num a, Num t, Ord k) => t -> Dm.Map k a -> [k] -> (t, Dm.Map k a) +-- first argument is all possible ngrams in a Map +-- second argument is all of the tokenized ngrams from the corpus +ngramProbs k ngrams [] = (k, ngrams) +ngramProbs k ngrams (n:ns) = case (Dm.lookup n ngrams) of + Nothing -> ngramProbs k ngrams ns + (Just count) -> let ngrams' = Dm.insert n (count+1) ngrams + in ngramProbs (k+1) ngrams' ns + +-- buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a +buildProbabilities ngrams = Dm.fromList [(ngram, 0) | ngram <- ngrams] + +main = do + corpus <- readFile "./xab" + let (n, ngramMap) = ngramProbs 0 (buildProbabilities englishQuintgrams) (quintgrams corpus) + appendFile "./quadgrams.json" "{" + mapM_ (out "./quadgrams.json" n) $ [(d,k) | (d,k) <- Dm.toList ngramMap, k /= 0] + appendFile "./quadgrams.json" "}" \ No newline at end of file