Browse Source

using bytestrings

master
Wesley Kerfoot 11 years ago
parent
commit
967c5ea725
  1. 11
      approximate.hs
  2. 50
      tokenize.hs

11
approximate.hs

@ -1,7 +1,6 @@
{-# LANGUAGE OverloadedStrings #-}
import Control.Monad
import Control.Applicative
import qualified Control.Monad.Random as MR
import Data.Aeson
import Data.Aeson.Types
@ -10,6 +9,7 @@ import qualified Data.ByteString.Lazy as DB
import Data.Char
frequencyMap [] ngmap = ngmap
frequencyMap ((_, 0.0):ngs) ngmap = frequencyMap ngs ngmap
frequencyMap ((ng, p):ngs) ngmap = case Dm.lookup (DB.head ng) ngmap of
Nothing -> frequencyMap ngs (Dm.insert (DB.head ng) [(ng, p)] ngmap)
Just xs -> frequencyMap ngs (Dm.insert (DB.head ng) ((ng, p) : xs) ngmap)
@ -30,6 +30,9 @@ generateText k n acc cur freqmap = do
gibberish n = do
js <- (DB.readFile "./quadgrams.json")
let (Just ngramPairs) = decode js :: Maybe (Dm.Map DB.ByteString Rational)
result <- generateText 4 n "" 32 (frequencyMap (Dm.toList ngramPairs) (Dm.singleton 32 []))
print result
main = gibberish 500
result <- generateText 4 n "" 116 (frequencyMap (Dm.toList ngramPairs) (Dm.singleton 0 []))
return result
main = do
output <- gibberish 1550
DB.writeFile "./output3.txt" output

50
tokenize.hs

@ -12,6 +12,8 @@ ngrams n xs = ngrams' n (length xs) xs
digrams = ngrams 2
trigrams = ngrams 3
quadgrams = ngrams 4
quintgrams = ngrams 5
-- start out by choosing a random unigram
-- then choose another letter with the conditional probability that it follows the first one
@ -34,31 +36,39 @@ select (x : xs) = (x, xs) : map (fmap (x :)) (select xs)
perm2 ks = [[x,y] | (x, ys) <- select ks, y <- ys]
permute 1 xs = map (:[]) xs
permute 2 xs = perm2 xs
permute n xs = join [[x : p | p <- (permute (n-1) ys)] | (x,ys) <- select xs]
alphabet = ((map return ".,\" :;?![]()") ++ (map return ['a'..'z'])) ++ (map (\x -> [x,x]) ['a'..'z'])
alphabet = ".,\" :;?![]()" ++ ['a'..'z']
englishDigrams = permute 2 alphabet
repeatLetters n = map $ take n . repeat
englishTrigrams = permute 3 alphabet
englishDigrams = (repeatLetters 2 alphabet) ++ permute 2 alphabet
englishTrigrams = (repeatLetters 3 alphabet) ++ permute 3 alphabet
englishQuadgrams = (repeatLetters 4 alphabet) ++ permute 4 alphabet
englishQuintgrams = (repeatLetters 5 alphabet) ++ permute 5 alphabet
out fname n (d, k) = appendFile fname $ (show d) ++ ":" ++ (show $ k/n) ++ ","
digramProbs :: (Num a, Num t, Ord k) => t -> Dm.Map k a -> [k] -> (t, Dm.Map k a)
-- first argument is tokenized digrams
-- seconds argument is all possible digrams
digramProbs n ds [] = (n, ds)
digramProbs n digrams (d:ds) = case (Dm.lookup d digrams) of
Nothing -> digramProbs n digrams ds
(Just count) -> let digrams' = Dm.insert d (count+1) digrams
in digramProbs (n+1) digrams' ds
buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a
buildProbabilities digrams = Dm.fromList [(digram, 0.0) | digram <- digrams]
-- main = do
-- corpus <- readFile "./xab"
-- let (n, digramMap) = digramProbs 0 (buildProbabilities englishTrigrams) (trigrams corpus)
-- mapM_ (out "./test2.json" n) $ Dm.toList digramMap
-- ngramProbs :: (Num a, Num t, Ord k) => t -> Dm.Map k a -> [k] -> (t, Dm.Map k a)
-- first argument is all possible ngrams in a Map
-- second argument is all of the tokenized ngrams from the corpus
ngramProbs k ngrams [] = (k, ngrams)
ngramProbs k ngrams (n:ns) = case (Dm.lookup n ngrams) of
Nothing -> ngramProbs k ngrams ns
(Just count) -> let ngrams' = Dm.insert n (count+1) ngrams
in ngramProbs (k+1) ngrams' ns
-- buildProbabilities :: (Fractional a, Ord k) => [k] -> Dm.Map k a
buildProbabilities ngrams = Dm.fromList [(ngram, 0) | ngram <- ngrams]
main = do
corpus <- readFile "./xab"
let (n, ngramMap) = ngramProbs 0 (buildProbabilities englishQuintgrams) (quintgrams corpus)
appendFile "./quadgrams.json" "{"
mapM_ (out "./quadgrams.json" n) $ [(d,k) | (d,k) <- Dm.toList ngramMap, k /= 0]
appendFile "./quadgrams.json" "}"