Files
GitMining/apriori.hs
2015-04-02 08:38:59 +02:00

94 lines
2.7 KiB
Haskell

module Main where
import System.Environment
import qualified Data.List as List
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Map (Map)
import qualified Data.Map as Map
import Text.ParserCombinators.Parsec
-- data structures defined here
data Item = Item String deriving (Eq, Ord)
instance Show Item where
show (Item s) = "Item " ++ s
data ItemSet = ItemSet (Set Item) deriving (Eq, Ord)
instance Show ItemSet where
show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> x ++ ",") . show) $ Set.toList x
in "{" ++ y ++ "}"
type Support = Double
type Confidence = Double
support :: [ItemSet] -> ItemSet -> Support
support iss is = (fromIntegral $ count iss is) / (fromIntegral $ length iss)
where
count iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss
confidence :: [ItemSet] -> ItemSet -> Item -> Confidence
confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i)
-- parse CSV
csvFile = endBy line eol
line = do
cells <- sepBy cell $ char ','
return $ ItemSet $ Set.fromList cells
cell = do
c <- many $ noneOf ",\r\n"
return $ Item $ c
eol = try (string "\n\r")
<|> try (string "\r\n")
<|> string "\n"
<|> string "\r"
parseCSV :: String -> Either ParseError [ItemSet]
parseCSV input = parse csvFile ("unknown") input
-- frequent items
frequentItems :: Integral a => [ItemSet] -> Support -> Map Item a
frequentItems x y = pi where
pi = Map.filter (>= frequency) m where
frequency = ceiling(y * (fromIntegral $ length x)) where
m = foldl count Map.empty x where
count m (ItemSet is) = Set.foldl f m is where
f m i = Map.insertWith acc i 1 m where
acc new old = old + 1
-- frequent itemsets
-- frequentItemSets :: [ItemSet] -> [Item] -> Support -> [ItemSet]
main :: IO ()
main = do
putStrLn "Apriori Algorithm for frequent itemsets mining"
[f, s] <- getArgs
c <- readFile f
case parse csvFile "(stdin)" c of
Left e -> do putStrLn "Error parsing input: "
print e
Right r -> do
let i = frequentItems r (read s)
print $ i
-- helper functions defined here
trim :: [Char] -> [Char]
trim = trimFront . trimBack
trimFront :: [Char] -> [Char]
trimFront (x:xs) = case x of
' ' -> trim xs
_ -> (x:xs)
trimBack :: [Char] -> [Char]
trimBack x = reverse $ trimFront $ reverse x
allItems :: [ItemSet] -> [Item]
allItems iss = Set.toList $ foldl add Set.empty iss where
add s (ItemSet is) = Set.union s is
candidate :: [ItemSet] -> [Item] -> [ItemSet]
candidate x y = List.nub $ [ItemSet (Set.union (Set.singleton b) a) | (ItemSet a) <- x, b <- y, Set.notMember b a]