diff --git a/Apriori.hs b/Apriori.hs new file mode 100644 index 0000000..778f09e --- /dev/null +++ b/Apriori.hs @@ -0,0 +1,56 @@ +module Apriori where + +import Data.Set (Set) +import qualified Data.Set as Set +import qualified Data.List as List +import Debug.Trace (traceShow) + +data Item = Item String deriving (Eq, Ord) +instance Show Item where + show (Item s) = s --"Item " ++ s + +data ItemSet = ItemSet (Set Item) deriving (Eq, Ord) +instance Show ItemSet where + show (ItemSet x) = foldr ((\y old -> y ++ " " ++ old).show) "" (Set.toList x) + +data Rule = Rule ItemSet ItemSet deriving (Eq) +instance Show Rule where + show (Rule a b) = show a ++ "-> " ++ show b + +type Frequency = Double + +type Count = Int + +frequency :: [ItemSet] -> ItemSet -> Frequency +frequency table (ItemSet set) = setCount / fromIntegral (length table) where + setCount = fromIntegral $ count table (ItemSet set) + + +semiUnion :: ItemSet -> ItemSet -> ItemSet +semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where + max1 = Set.findMax set1 + max2 = Set.findMax set2 + +-- generate all possible combinations from a set of singletons +generateLevels :: [Item] -> [[ItemSet]] +generateLevels singles = until (\x -> head x == lastLevel) (\x -> generateNextLevel (head x) : x) [firstLevel] where + firstLevel = map (\x -> ItemSet $ Set.fromList [x]) singles + lastLevel = [ItemSet $ Set.fromList singles] + +-- generate the next level in a bottom-up route +generateNextLevel :: [ItemSet] -> [ItemSet] +generateNextLevel level = traceShow ("Computing level " ++ show (isSize (head level))) $ foldr (\value old -> generate value ++ old) [] level where + generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) + empty = ItemSet $ Set.fromList [] + isSize (ItemSet set) = Set.size set + +count :: [ItemSet] -> ItemSet -> Count +count table (ItemSet set) = length (filter (\(ItemSet row) -> set `Set.isSubsetOf` row) table) + +singletons :: [ItemSet] -> [Item] +singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where + +frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]] +frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where + firstLevel = map (\x -> ItemSet $ Set.fromList [x]) (singletons table) + filterByFrequency = filter (\x -> frequency table x >= thresh) diff --git a/CSVParser.hs b/CSVParser.hs index e353ec3..0fd325d 100644 --- a/CSVParser.hs +++ b/CSVParser.hs @@ -2,7 +2,6 @@ module CSVParser ( parseCSV )where - import Text.ParserCombinators.Parsec csvFile :: GenParser Char st [[String]] @@ -29,11 +28,10 @@ remainingCells = <|> return [] -- No comma? Return [], no more cells cellContent :: GenParser Char st String -cellContent = - many (noneOf ",\n") +cellContent = many (noneOf ",\n") eol :: GenParser Char st Char eol = char '\n' parseCSV :: String -> Either ParseError [[String]] -parseCSV input = parse csvFile "(unknown)" input +parseCSV = parse csvFile "(unknown)" diff --git a/Main.hs b/Main.hs index d9e88a2..5de1d1a 100644 --- a/Main.hs +++ b/Main.hs @@ -1,61 +1,8 @@ module Main where -import Data.Set (Set) -import qualified Data.Set as Set -import qualified Data.List as List -import Debug.Trace (traceShow) import CSVParser - --- data structures defined here - -data Item = Item String deriving (Eq, Ord) -instance Show Item where - show (Item s) = s --"Item " ++ s - -data ItemSet = ItemSet (Set Item) deriving (Eq, Ord) -instance Show ItemSet where - show (ItemSet x) = foldr ((\y old -> y ++ " " ++ old).show) "" (Set.toList x) - -data Rule = Rule ItemSet ItemSet deriving (Eq) -instance Show Rule where - show (Rule a b) = show a ++ "-> " ++ show b - - -type Frequency = Double -type Count = Int - -semiUnion :: ItemSet -> ItemSet -> ItemSet -semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where - max1 = Set.findMax set1 - max2 = Set.findMax set2 - --- generate all possible combinations from a set of singletons -generateLevels :: [Item] -> [[ItemSet]] -generateLevels singles = until (\x -> head x == lastLevel) (\x -> generateNextLevel (head x) : x) [firstLevel] where - firstLevel = map (\x -> ItemSet $ Set.fromList [x]) singles - lastLevel = [ItemSet $ Set.fromList singles] - -generateNextLevel :: [ItemSet] -> [ItemSet] -generateNextLevel level = traceShow ("Computing level " ++ show (isSize (head level))) $ foldr (\value old -> generate value ++ old) [] level where - generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) - empty = ItemSet $ Set.fromList [] - isSize (ItemSet set) = Set.size set - -frequency :: [ItemSet] -> ItemSet -> Frequency -frequency table (ItemSet set) = setCount / fromIntegral (length table) where - setCount = fromIntegral $ count table (ItemSet set) - -count :: [ItemSet] -> ItemSet -> Count -count table (ItemSet set) = length (filter (\(ItemSet row) -> set `Set.isSubsetOf` row) table) - -singletons :: [ItemSet] -> [Item] -singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where - -frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]] -frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where - firstLevel = map (\x -> ItemSet $ Set.fromList [x]) (singletons table) - filterByFrequency = filter (\x -> frequency table x >= thresh) - +import Apriori +import qualified Data.Set as Set main :: IO() main = do