diff --git a/DataModel.hs b/DataModel.hs index 7e438f6..ae175a1 100644 --- a/DataModel.hs +++ b/DataModel.hs @@ -11,8 +11,9 @@ class Freq a where frequency :: [ItemSet] -> a -> Frequency data Item = Item String deriving (Eq, Ord) + instance Show Item where - show (Item s) = s --"Item " ++ s + show (Item a) = show a data ItemSet = ItemSet (Set Item) deriving (Eq, Ord) @@ -27,16 +28,23 @@ instance Freq ItemSet where count :: [ItemSet] -> ItemSet -> Count count table (ItemSet set) = - length (filter isSuperSet table) where + length (filter isSuperset table) where isSuperset (ItemSet row) = set `Set.isSubsetOf` row +difference :: ItemSet -> ItemSet -> ItemSet +difference (ItemSet set1) (ItemSet set2) = ItemSet (Set.difference set1 set2) + +empty :: ItemSet +empty = ItemSet (Set.fromList []) + data Rule = Rule ItemSet ItemSet deriving (Eq) instance Show Rule where show (Rule a b) = show a ++ "-> " ++ show b instance Freq Rule where - frequency table (Rule (ItemSet set1) (ItemSet set2)) = frequency table $ ItemSet (set1 `Set.union` set2) + frequency table (Rule (ItemSet set1) (ItemSet set2)) = frequency table $ + ItemSet (set1 `Set.union` set2) confidence :: [ItemSet] -> Rule -> Confidence confidence table (Rule x y) = frequency table (Rule x y) / frequency table x diff --git a/ExtractRules.hs b/ExtractRules.hs new file mode 100644 index 0000000..50b3727 --- /dev/null +++ b/ExtractRules.hs @@ -0,0 +1,32 @@ +module ExtractRules ( + extractRules +) where +import DataModel +import qualified Data.Set as Set +import qualified Data.Map as Map +import Data.Map(Map) + +extractRules :: Confidence -> [ItemSet] -> [Rule] +extractRules threshold patterns = filter (\x -> threshold <= confidence patterns x) rules where + rules = Map.foldrWithKey (\k v old -> ruleFromSubset v k : old) [] subsets + subsets = foldr (\x old -> insertMultiple (filteredPowerset x) x old) Map.empty patterns + filteredPowerset (ItemSet set) = map (ItemSet . Set.fromList) $ + filter (\val -> val /= Set.toList set && val /= []) $ powerset $ Set.toList set + +ruleFromSubset :: ItemSet -> ItemSet -> Rule +ruleFromSubset set subset = Rule subset (difference set subset) + + + +insertMultiple :: Ord k => [k] -> a -> Map k a -> Map k a +insertMultiple keys value m = foldr (\x old -> Map.insert x value old) m keys + +powerset :: [a] -> [[a]] +powerset [] = [[]] +powerset (x:xs) = xss /\/ map (x:) xss where + xss = powerset xs + +(/\/) :: [a] -> [a] -> [a] +[] /\/ ys = ys +(x:xs) /\/ ys = x : (ys /\/ xs) + diff --git a/FrequentPatterns.hs b/FrequentPatterns.hs index 2715887..09e0ba1 100644 --- a/FrequentPatterns.hs +++ b/FrequentPatterns.hs @@ -7,9 +7,12 @@ import Debug.Trace (traceShow) import qualified Data.List as List semiUnion :: ItemSet -> ItemSet -> ItemSet -semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where - max1 = Set.findMax set1 - max2 = Set.findMax set2 +semiUnion (ItemSet set1) (ItemSet set2) = ItemSet $ + if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 + then set1 `Set.union` set2 + else Set.empty where + max1 = Set.findMax set1 + max2 = Set.findMax set2 -- generate all possible combinations from a set of singletons -- generateLevels :: [Item] -> [[ItemSet]] @@ -19,15 +22,18 @@ semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete -- generate the next level in a bottom-up route generateNextLevel :: [ItemSet] -> [ItemSet] -generateNextLevel level = traceShow ("Computing level " ++ show (isSize (head level))) $ foldr (\value old -> generate value ++ old) [] level where - generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) - empty = ItemSet $ Set.fromList [] - isSize (ItemSet set) = Set.size set +generateNextLevel level = traceShow ("Computing level " ++ show (isSize (head level))) $ + foldr (\value old -> generate value ++ old) [] level where + generate value = takeWhile (/= empty) + (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) + isSize (ItemSet set) = Set.size set singletons :: [ItemSet] -> [Item] -singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where +singletons table = Set.toList $ foldr union (Set.fromList []) table where + union (ItemSet row) old = old `Set.union` row frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]] -frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where - firstLevel = filterByFrequency $ map (\x -> ItemSet $ Set.fromList [x]) (singletons table) - filterByFrequency = filter (\x -> frequency table x >= thresh) +frequentPatterns thresh table = until (\x -> [] == head x) + (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where + firstLevel = filterByFrequency $ map (\x -> ItemSet $ Set.fromList [x]) (singletons table) + filterByFrequency = filter (\x -> frequency table x >= thresh) diff --git a/phase1.hs b/phase1.hs index 8491cac..628ae72 100644 --- a/phase1.hs +++ b/phase1.hs @@ -13,6 +13,6 @@ main = do let threshold = read $ last args file <- readFile filename case parseCSV file of - Left _ -> error "Could not parse out.csv" - Right val -> mapM_ (\x -> putStrLn (show x ++ "(" ++ show (count table x) ++ ")")) (concat (frequentPatterns threshold table)) where - table = map (ItemSet. Set.fromList .map Item) val + Left _ -> error "Could not parse out.csv" + Right val -> mapM_ (\x -> putStrLn (show x ++ "(" ++ show (count table x) ++ ")")) (concat (frequentPatterns threshold table)) where + table = map (ItemSet. Set.fromList .map Item) val diff --git a/phase2.hs b/phase2.hs new file mode 100644 index 0000000..2226135 --- /dev/null +++ b/phase2.hs @@ -0,0 +1,20 @@ +import CSVParser +import FrequentPatterns +import DataModel +import ExtractRules +import qualified Data.Set as Set +import System.Environment(getArgs) +import Control.Monad(when) + +main :: IO() +main = do + args <- getArgs + when (2 /= length args) (error "Usage: phase2 ") + let filename = head args + let threshold = read $ last args + file <- readFile filename + case parseCSV file of + Left _ -> error "Could not parse out.csv" + Right val -> print $ extractRules threshold $ + filter (/= empty) $ concat $ frequentPatterns 0.01 table where + table = map (ItemSet. Set.fromList .map Item) val