From e66fad749ff6ccef0410c8f70948881929474b13 Mon Sep 17 00:00:00 2001 From: IGI-111 Date: Fri, 3 Apr 2015 16:05:22 +0200 Subject: [PATCH] rewrote the algorithm from scratch to implement semi-unions --- apriori.hs | 110 +++++++++++++++-------------------------------------- 1 file changed, 31 insertions(+), 79 deletions(-) diff --git a/apriori.hs b/apriori.hs index dd74c78..9f97805 100644 --- a/apriori.hs +++ b/apriori.hs @@ -1,100 +1,52 @@ module Main where -import System.Environment -import qualified Data.List as List import Data.Set (Set) import qualified Data.Set as Set -import Data.Map (Map) -import qualified Data.Map as Map -import Text.ParserCombinators.Parsec +import qualified Data.List as List -- data structures defined here data Item = Item String deriving (Eq, Ord) instance Show Item where - show (Item s) = s --"Item " ++ s + show (Item s) = s --"Item " ++ s data ItemSet = ItemSet (Set Item) deriving (Eq, Ord) instance Show ItemSet where - show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> " " ++ x) . show) $ Set.toList x - in y + show (ItemSet x) = let y = foldl (++) "" $ map show $ Set.toList x in y -type Support = Double -type Confidence = Double -type Frequency = Int +type Frequency = Double + +semiUnion :: ItemSet -> ItemSet -> ItemSet +semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where + max1 = Set.findMax set1 + max2 = Set.findMax set2 + +-- generate all possible combinations from a set of singletons +generateLevels :: [Item] -> [[ItemSet]] +generateLevels singles = until (\x -> head x == lastLevel) (\x -> generateNextLevel (head x) : x) [firstLevel] where + firstLevel = map (\x -> ItemSet $ Set.fromList [x]) singles + lastLevel = [ItemSet $ Set.fromList singles] + +generateNextLevel :: [ItemSet] -> [ItemSet] +generateNextLevel level = foldr (\value old -> generate value ++ old) [] level where + generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) where + empty = ItemSet $ Set.fromList [] frequency :: [ItemSet] -> ItemSet -> Frequency -frequency iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss +frequency table (ItemSet set) = setCount / fromIntegral (length table) where + setCount = fromIntegral $ length (filter (\(ItemSet row) -> set `Set.isSubsetOf` row) table) -support :: [ItemSet] -> ItemSet -> Support -support iss is = (fromIntegral $ frequency iss is) / (fromIntegral $ length iss) - -confidence :: [ItemSet] -> ItemSet -> Item -> Confidence -confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i) - --- parse CSV - -csvFile = endBy line eol -line = do - cells <- sepBy cell $ char ',' - return $ ItemSet $ Set.fromList cells -cell = do - c <- many $ noneOf ",\r\n" - return $ Item $ c -eol = try (string "\n\r") - <|> try (string "\r\n") - <|> string "\n" - <|> string "\r" - -parseCSV :: String -> Either ParseError [ItemSet] -parseCSV input = parse csvFile ("unknown") input - --- frequent items -frequentItems :: [ItemSet] -> Support -> [Item] -frequentItems x y = Set.elems $ Map.keysSet pi where - pi = Map.filter (>= frequency) m where - frequency = ceiling(y * (fromIntegral $ length x)) where - m = foldl count Map.empty x where - count m (ItemSet is) = Set.foldl f m is where - f m i = Map.insertWith acc i 1 m where - acc new old = old + 1 +singletons :: [ItemSet] -> [Item] +singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where -displayFrequentItems :: Map ItemSet Frequency -> String -displayFrequentItems i = Map.foldrWithKey (\key val old -> old ++ (itemToString key val) ++ "\n" ) "" i where - itemToString is freq = "("++(show freq)++")" ++ (show is) +frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]] +frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where + firstLevel = map (\x -> ItemSet $ Set.fromList [x]) (singletons table) + filterByFrequency = filter (\x -> frequency table x >= thresh) -frequentItemSets :: [ItemSet] -> [ItemSet] -> Support -> Map ItemSet Frequency -frequentItemSets r candidates threshold = Map.fromList $ map (\c -> (c, frequency r c)) filtered - where filtered = filter (\c -> support r c >= threshold) candidates -main :: IO () -main = do - putStrLn "Apriori Algorithm for frequent itemsets mining" - [f, s] <- getArgs - c <- readFile f - case parse csvFile "(stdin)" c of - Left e -> do putStrLn "Error parsing input: " - print e - Right r -> do - let is = frequentItems r (read s) - let iss = frequentItemSets r (candidates is) (read s) - putStrLn $ displayFrequentItems iss +main :: IO() +main = print $ frequentPatterns 1 table where + table = [ItemSet (Set.fromList [Item "a", Item "b", Item "c"]), ItemSet (Set.fromList [Item "a", Item "b"])] --- helper functions defined here - -allItems :: [ItemSet] -> [Item] -allItems iss = Set.toList $ foldl add Set.empty iss where - add s (ItemSet is) = Set.union s is - -candidates :: [Item] -> [ItemSet] -candidates x = map (\old -> ItemSet (Set.fromList old)) (powerset x) - -powerset :: [a] -> [[a]] -powerset [] = [[]] -powerset (x:xs) = xss /\/ map (x:) xss - where xss = powerset xs - -(/\/) :: [a] -> [a] -> [a] -[] /\/ ys = ys -(x:xs) /\/ ys = x : (ys /\/ xs)