diff --git a/apriori.hs b/apriori.hs new file mode 100644 index 0000000..869bfbf --- /dev/null +++ b/apriori.hs @@ -0,0 +1,93 @@ +module Main where + +import System.Environment +import qualified Data.List as List +import Data.Set (Set) +import qualified Data.Set as Set +import Data.Map (Map) +import qualified Data.Map as Map +import Text.ParserCombinators.Parsec + +-- data structures defined here + +data Item = Item String deriving (Eq, Ord) +instance Show Item where + show (Item s) = "Item " ++ s + +data ItemSet = ItemSet (Set Item) deriving (Eq, Ord) +instance Show ItemSet where + show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> x ++ ",") . show) $ Set.toList x + in "{" ++ y ++ "}" + +type Support = Double +type Confidence = Double + +support :: [ItemSet] -> ItemSet -> Support +support iss is = (fromIntegral $ count iss is) / (fromIntegral $ length iss) + where + count iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss + +confidence :: [ItemSet] -> ItemSet -> Item -> Confidence +confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i) + +-- parse CSV + +csvFile = endBy line eol +line = do + cells <- sepBy cell $ char ',' + return $ ItemSet $ Set.fromList cells +cell = do + c <- many $ noneOf ",\r\n" + return $ Item $ c +eol = try (string "\n\r") + <|> try (string "\r\n") + <|> string "\n" + <|> string "\r" + +parseCSV :: String -> Either ParseError [ItemSet] +parseCSV input = parse csvFile ("unknown") input + +-- frequent items +frequentItems :: [ItemSet] -> Support -> [(Item, Integer)] +frequentItems x y = Set.toList $ Map.keysSet pi frequency where + pi = Map.filter (>= frequency) m where + frequency = ceiling(y * (fromIntegral $ length x)) where + m = foldl count Map.empty x where + count m (ItemSet is) = Set.foldl f m is where + f m i = Map.insertWith acc i 1 m where + acc new old = old + 1 + +-- frequent itemsets +-- frequentItemSets :: [ItemSet] -> [Item] -> Support -> [ItemSet] + +main :: IO () +main = do + putStrLn "Apriori Algorithm for frequent itemsets mining" + [f, s] <- getArgs + c <- readFile f + case parse csvFile "(stdin)" c of + Left e -> do putStrLn "Error parsing input: " + print e + Right r -> do + let i = frequentItems r (read s) + mapM_ print $ i + +-- helper functions defined here + +trim :: [Char] -> [Char] +trim = trimFront . trimBack + +trimFront :: [Char] -> [Char] +trimFront (x:xs) = case x of + ' ' -> trim xs + _ -> (x:xs) + +trimBack :: [Char] -> [Char] +trimBack x = reverse $ trimFront $ reverse x + +allItems :: [ItemSet] -> [Item] +allItems iss = Set.toList $ foldl add Set.empty iss where + add s (ItemSet is) = Set.union s is + +candidate :: [ItemSet] -> [Item] -> [ItemSet] +candidate x y = List.nub $ [ItemSet (Set.union (Set.singleton b) a) | (ItemSet a) <- x, b <- y, Set.notMember b a]