commit 4d8fe0fada82e13f9a9b7c1abcebab9a00774f0f Author: Wesley Kerfoot Date: Thu Jan 24 21:20:03 2013 -0500 first commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/approximate.hs b/approximate.hs new file mode 100644 index 0000000..dbf5f01 --- /dev/null +++ b/approximate.hs @@ -0,0 +1,28 @@ +import qualified Control.Monad.Random as MR +import Text.JSON +import qualified Data.Map as Dm + +fromJSRational (s, (JSRational _ r)) = (s,fromRational r) + +ngramp ngJSON = let (Ok (JSObject result)) = decode ngJSON :: Result JSValue + in Dm.fromList $ map fromJSRational $ fromJSObject $ result + +frequencyMap [] ngmap = ngmap +frequencyMap ((ng, p):ngs) ngmap = case Dm.lookup (head ng) ngmap of + Nothing -> frequencyMap ngs (Dm.insert (head ng) [(ng, p)] ngmap) + Just xs -> frequencyMap ngs (Dm.insert (head ng) ((ng, p) : xs) ngmap) + + +generateText n acc cur freqmap = do + let (Just probabilities) = Dm.lookup cur freqmap + choice <- MR.fromList probabilities + let next = (head $ tail choice) + case n of + 0 -> return acc + _ -> generateText (n-1) (acc++([head choice])) next freqmap + +gibberish n = do + js <- readFile "./ngrams.json" + let pmap = ngramp js + result <- generateText n [] 't' (frequencyMap (Dm.toList pmap) (Dm.singleton '' [])) + return result \ No newline at end of file diff --git a/tokenize.hs b/tokenize.hs new file mode 100644 index 0000000..cb1ed9c --- /dev/null +++ b/tokenize.hs @@ -0,0 +1,64 @@ +import Data.List.Split +import Data.Char +import qualified Data.Map as Dm +import Control.Applicative +import Control.Monad + +ngrams' n len xs = let next = (take n xs) + in case len == n of + False -> (toLower <$> next) : (ngrams' n (len - 1) $ drop 1 xs) + _ -> return xs +ngrams n xs = ngrams' n (length xs) xs + +digrams = ngrams 2 +trigrams = ngrams 3 +-- start out by choosing a random unigram + +-- then choose another letter with the conditional probability that it follows the first one +-- will have to calculate the conditional probability that any letter follows any other letter + +-- see how many digrams start with that letter +startsP' :: Char -> [String] -> (Int, Int) +startsP' letter dgs = foldr check (0, 0) dgs where + check (first:second:[]) (a, n) = case (first == letter) of + True -> (a + 1, n + 1) + _ -> (a, n + 1) + check (first:[]) (a, n) = (a, n + 1) + +startsP letter dgs = let (n, k) = startsP' letter (digrams dgs) + in (fromIntegral n) / (fromIntegral k) + + +select [] = [] +select (x : xs) = (x, xs) : map (fmap (x :)) (select xs) + +perm2 ks = [[x,y] | (x, ys) <- select ks, y <- ys] + +permute 2 xs = perm2 xs +permute n xs = join [[x : p | p <- (permute (n-1) ys)] | (x,ys) <- select xs] + +alphabet = ((map return ".,\" :;?![]()") ++ (map return ['a'..'z'])) ++ (map (\x -> [x,x]) ['a'..'z']) + +englishDigrams = permute 2 alphabet + +englishTrigrams = permute 3 alphabet + +out fname n (d, k) = appendFile fname $ (show d) ++ ":" ++ (show $ k/n) ++ "," + +digramProbs :: (Num a, Num t, Ord k) => t -> Dm.Map k a -> [k] -> (t, Dm.Map k a) +-- first argument is tokenized digrams +-- seconds argument is all possible digrams +digramProbs n ds [] = (n, ds) +digramProbs n digrams (d:ds) = case (Dm.lookup d digrams) of + Nothing -> digramProbs n digrams ds + (Just count) -> let digrams' = Dm.insert d (count+1) digrams + in digramProbs (n+1) digrams' ds + +buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a +buildProbabilities digrams = Dm.fromList [(digram, 0.0) | digram <- digrams] + +-- main = do + -- corpus <- readFile "./xab" + -- let (n, digramMap) = digramProbs 0 (buildProbabilities englishTrigrams) (trigrams corpus) + -- mapM_ (out "./test2.json" n) $ Dm.toList digramMap + \ No newline at end of file