diff --git a/CSVParser.hs b/CSVParser.hs index 0fd325d..2027677 100644 --- a/CSVParser.hs +++ b/CSVParser.hs @@ -1,31 +1,31 @@ module CSVParser ( - parseCSV -)where + parseCSV + )where import Text.ParserCombinators.Parsec csvFile :: GenParser Char st [[String]] csvFile = do - result <- many line - eof - return result + result <- many line + eof + return result line :: GenParser Char st [String] line = do - result <- cells - _ <- eol - return result + result <- cells + _ <- eol + return result cells :: GenParser Char st [String] cells = do - first <- cellContent - next <- remainingCells - return (first : next) + first <- cellContent + next <- remainingCells + return (first : next) remainingCells :: GenParser Char st [String] remainingCells = - (char ',' >> cells) -- Found comma? More cells coming - <|> return [] -- No comma? Return [], no more cells + (char ',' >> cells) -- Found comma? More cells coming + <|> return [] -- No comma? Return [], no more cells cellContent :: GenParser Char st String cellContent = many (noneOf ",\n") diff --git a/DataModel.hs b/DataModel.hs index eff9c54..7ce2e48 100644 --- a/DataModel.hs +++ b/DataModel.hs @@ -8,31 +8,44 @@ type Frequency = Double type Confidence = Double class Freq a where - frequency :: [ItemSet] -> a -> Frequency + frequency :: [ItemSet] -> a -> Frequency data Item = Item String deriving (Eq, Ord) + instance Show Item where - show (Item s) = s --"Item " ++ s + show (Item a) = a + data ItemSet = ItemSet (Set Item) deriving (Eq, Ord) instance Show ItemSet where - show (ItemSet x) = foldr ((\y old -> y ++ " " ++ old).show) "" (Set.toList x) + show (ItemSet x) = + init $ foldr ((\y old -> y ++ " " ++ old).show) "" (Set.toList x) instance Freq ItemSet where - frequency table (ItemSet set) = setCount / fromIntegral (length table) where - setCount = fromIntegral $ count table (ItemSet set) + frequency table (ItemSet set) = + setCount / fromIntegral (length table) where + setCount = fromIntegral $ count table (ItemSet set) count :: [ItemSet] -> ItemSet -> Count -count table (ItemSet set) = length (filter (\(ItemSet row) -> set `Set.isSubsetOf` row) table) +count table (ItemSet set) = + length (filter isSuperset table) where + isSuperset (ItemSet row) = set `Set.isSubsetOf` row + +difference :: ItemSet -> ItemSet -> ItemSet +difference (ItemSet set1) (ItemSet set2) = ItemSet (Set.difference set1 set2) + +empty :: ItemSet +empty = ItemSet (Set.fromList []) data Rule = Rule ItemSet ItemSet deriving (Eq) instance Show Rule where - show (Rule a b) = show a ++ "-> " ++ show b + show (Rule a b) = show a ++ "->" ++ show b instance Freq Rule where - frequency table (Rule (ItemSet set1) (ItemSet set2)) = frequency table $ ItemSet (set1 `Set.union` set2) + frequency table (Rule (ItemSet set1) (ItemSet set2)) = frequency table $ + ItemSet (set1 `Set.union` set2) confidence :: [ItemSet] -> Rule -> Confidence confidence table (Rule x y) = frequency table (Rule x y) / frequency table x diff --git a/ExtractRules.hs b/ExtractRules.hs new file mode 100644 index 0000000..50b3727 --- /dev/null +++ b/ExtractRules.hs @@ -0,0 +1,32 @@ +module ExtractRules ( + extractRules +) where +import DataModel +import qualified Data.Set as Set +import qualified Data.Map as Map +import Data.Map(Map) + +extractRules :: Confidence -> [ItemSet] -> [Rule] +extractRules threshold patterns = filter (\x -> threshold <= confidence patterns x) rules where + rules = Map.foldrWithKey (\k v old -> ruleFromSubset v k : old) [] subsets + subsets = foldr (\x old -> insertMultiple (filteredPowerset x) x old) Map.empty patterns + filteredPowerset (ItemSet set) = map (ItemSet . Set.fromList) $ + filter (\val -> val /= Set.toList set && val /= []) $ powerset $ Set.toList set + +ruleFromSubset :: ItemSet -> ItemSet -> Rule +ruleFromSubset set subset = Rule subset (difference set subset) + + + +insertMultiple :: Ord k => [k] -> a -> Map k a -> Map k a +insertMultiple keys value m = foldr (\x old -> Map.insert x value old) m keys + +powerset :: [a] -> [[a]] +powerset [] = [[]] +powerset (x:xs) = xss /\/ map (x:) xss where + xss = powerset xs + +(/\/) :: [a] -> [a] -> [a] +[] /\/ ys = ys +(x:xs) /\/ ys = x : (ys /\/ xs) + diff --git a/FreqParser.hs b/FreqParser.hs new file mode 100644 index 0000000..62a8a1d --- /dev/null +++ b/FreqParser.hs @@ -0,0 +1,11 @@ +module ItemsParser ( + parseItems, + parseItemsWithFreq + )where +import qualified Data.Map as Map +import Data.Map(Map) +import DataModel + +parseItems :: String -> Either ParseError [ItemSet] + +parseItemsWithFreq :: String -> Either ParseError Map ItemSet Frequency diff --git a/FrequentPatterns.hs b/FrequentPatterns.hs index 1ee7eeb..891a14d 100644 --- a/FrequentPatterns.hs +++ b/FrequentPatterns.hs @@ -1,15 +1,18 @@ module FrequentPatterns ( - frequentPatterns -) where + frequentPatterns + ) where import DataModel import qualified Data.Set as Set -import Debug.Trace (traceShow) +import Debug.Trace (trace) import qualified Data.List as List semiUnion :: ItemSet -> ItemSet -> ItemSet -semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where - max1 = Set.findMax set1 - max2 = Set.findMax set2 +semiUnion (ItemSet set1) (ItemSet set2) = ItemSet $ + if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 + then set1 `Set.union` set2 + else Set.empty where + max1 = Set.findMax set1 + max2 = Set.findMax set2 -- generate all possible combinations from a set of singletons -- generateLevels :: [Item] -> [[ItemSet]] @@ -19,15 +22,19 @@ semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete -- generate the next level in a bottom-up route generateNextLevel :: [ItemSet] -> [ItemSet] -generateNextLevel level = traceShow ("Computing level " ++ show (isSize (head level))) $ foldr (\value old -> generate value ++ old) [] level where - generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) - empty = ItemSet $ Set.fromList [] - isSize (ItemSet set) = Set.size set +generateNextLevel level = trace ("Computing level " ++ show (isSize (head level))) $ + foldr (\value old -> generate value ++ old) [] level where + generate value = takeWhile (/= empty) + (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) + isSize (ItemSet set) = Set.size set singletons :: [ItemSet] -> [Item] -singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where +singletons table = Set.toList $ foldr union (Set.fromList []) table where + union (ItemSet row) old = old `Set.union` row frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]] -frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where - firstLevel = filterByFrequency $ map (\x -> ItemSet $ Set.fromList [x]) (singletons table) - filterByFrequency = filter (\x -> frequency table x >= thresh) +frequentPatterns thresh table = until (\x -> [] == head x) + (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where + firstLevel = filterByFrequency $ map (\x -> ItemSet $ Set.fromList [x]) $ + trace "Generated Singletons" (singletons table) + filterByFrequency = filter (\x -> frequency table x >= thresh) diff --git a/phase1.hs b/phase1.hs index 74eb09d..500617a 100644 --- a/phase1.hs +++ b/phase1.hs @@ -7,12 +7,22 @@ import Control.Monad(when) main :: IO() main = do - args <- getArgs - when (2 /= length args) (error "Usage: Main ") - let filename = head args - let threshold = read $ last args - file <- readFile filename - case parseCSV file of - Left _ -> error "Could not parse out.csv" - Right val -> mapM_ (\x -> putStrLn (show x ++ "(" ++ show (count table x) ++ ")")) (concat (frequentPatterns threshold table)) where - table = map (ItemSet. Set.fromList .map Item) val + args <- getArgs + when (2 > length args) + (error "Usage: Main ") + let filename = head args + let threshold = read $ args !! 1 + file <- readFile filename + case parseCSV file of + Left _ -> error "Could not parse out.csv" + Right val -> do + let table = map (ItemSet. Set.fromList .map Item) val + let freqPats = concat (frequentPatterns threshold table) + let output = formatToCSV table freqPats + putStrLn output + when (length args > 2) $ + writeFile (args !! 2) $ output + +formatToCSV :: [ItemSet] -> [ItemSet] -> String +formatToCSV table frequents = foldr (\x old -> old ++ formatRow x ++ "\n") "" frequents where + formatRow (ItemSet set) = init $ Set.foldr (\x old -> old ++ show x ++ ",") (show (count table (ItemSet set)) ++",") set diff --git a/phase2.hs b/phase2.hs new file mode 100644 index 0000000..7240594 --- /dev/null +++ b/phase2.hs @@ -0,0 +1,19 @@ +import CSVParser +import DataModel +import ExtractRules +import qualified Data.Set as Set +import System.Environment(getArgs) +import Control.Monad(when) + +main :: IO() +main = do + args <- getArgs + when (2 /= length args) (error "Usage: phase2 ") + let filename = head args + let threshold = read $ last args + file <- readFile filename + case parseCSV file of + Left _ -> error "Could not parse out.csv" + Right val -> do + let table = map (ItemSet. Set.fromList .map Item) val + print $ extractRules threshold table