rewrote the algorithm from scratch to implement semi-unions

2015-04-03 16:05:22 +02:00
parent 99613bf333
commit e66fad749f
1 changed files with 31 additions and 79 deletions
--- a/apriori.hs
+++ b/apriori.hs
@@ -1,12 +1,8 @@
 module Main where

-import System.Environment
-import qualified Data.List as List
 import Data.Set (Set)
 import qualified Data.Set as Set
-import Data.Map (Map)
-import qualified Data.Map as Map
-import Text.ParserCombinators.Parsec
+import qualified Data.List as List

 -- data structures defined here

@@ -16,85 +12,41 @@ instance Show Item where

 data ItemSet = ItemSet (Set Item) deriving (Eq, Ord)
 instance Show ItemSet where
-  show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> " " ++ x) . show) $ Set.toList x
-                         in y
+	show (ItemSet x) = let y = foldl (++) "" $ map show $ Set.toList x in y

-type Support = Double
-type Confidence = Double
-type Frequency = Int
+type Frequency = Double
+
+semiUnion :: ItemSet -> ItemSet -> ItemSet
+semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where
+	max1 = Set.findMax set1
+	max2 = Set.findMax set2
+
+-- generate all possible combinations from a set of singletons
+generateLevels :: [Item] -> [[ItemSet]]
+generateLevels singles = until (\x -> head x == lastLevel) (\x -> generateNextLevel (head x) : x) [firstLevel] where
+	firstLevel = map (\x -> ItemSet $ Set.fromList [x]) singles
+	lastLevel = [ItemSet $ Set.fromList singles]
+
+generateNextLevel :: [ItemSet] -> [ItemSet]
+generateNextLevel level = foldr (\value old -> generate value ++ old) [] level where
+	generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) where
+		empty = ItemSet $ Set.fromList []

 frequency :: [ItemSet] -> ItemSet -> Frequency
-frequency iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss
+frequency table (ItemSet set) = setCount / fromIntegral (length table) where
+	setCount = fromIntegral $ length (filter (\(ItemSet row) -> set `Set.isSubsetOf` row) table)

-support :: [ItemSet] -> ItemSet -> Support
-support iss is = (fromIntegral $ frequency iss is) / (fromIntegral $ length iss)
-
-confidence :: [ItemSet] -> ItemSet -> Item -> Confidence
-confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i)
-
-- parse CSV
-
-csvFile = endBy line eol
-line = do
-  cells <- sepBy cell $ char ','
-  return $ ItemSet $ Set.fromList cells
-cell = do
-  c <- many $ noneOf ",\r\n"
-  return $ Item $ c
-eol =   try (string "\n\r")
-    <|> try (string "\r\n")
-    <|> string "\n"
-    <|> string "\r"
-
-parseCSV :: String -> Either ParseError [ItemSet]
-parseCSV input = parse csvFile ("unknown") input
-
-- frequent items
-frequentItems :: [ItemSet] -> Support -> [Item]
-frequentItems x y = Set.elems $ Map.keysSet pi where
-  pi = Map.filter (>= frequency) m where
-  frequency = ceiling(y * (fromIntegral $ length x)) where
-  m = foldl count Map.empty x where
-  count m (ItemSet is) = Set.foldl f m is where
-  f m i = Map.insertWith acc i 1 m where
-  acc new old = old + 1
+singletons :: [ItemSet] -> [Item]
+singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where


-displayFrequentItems :: Map ItemSet Frequency -> String
-displayFrequentItems i = Map.foldrWithKey (\key val old -> old ++ (itemToString key val) ++ "\n" ) "" i where
-	itemToString is freq = "("++(show freq)++")" ++ (show is)
+frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]]
+frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where
+	firstLevel = map (\x -> ItemSet $ Set.fromList [x]) (singletons table)
+	filterByFrequency = filter (\x -> frequency table x >= thresh)

-frequentItemSets :: [ItemSet] -> [ItemSet] -> Support -> Map ItemSet Frequency
-frequentItemSets r candidates threshold = Map.fromList $ map (\c -> (c, frequency r c)) filtered
-	where filtered = filter (\c -> support r c >= threshold) candidates

 main :: IO()
-main = do
-  putStrLn "Apriori Algorithm for frequent itemsets mining"
-  [f, s] <- getArgs
-  c <- readFile f
-  case parse csvFile "(stdin)" c of
-    Left e -> do putStrLn "Error parsing input: "
-                 print e
-    Right r -> do
-      let is = frequentItems r (read s)
-      let iss = frequentItemSets r (candidates is) (read s)
-      putStrLn $ displayFrequentItems iss
+main = print $ frequentPatterns 1 table where
+	table = [ItemSet (Set.fromList [Item "a", Item "b", Item "c"]), ItemSet (Set.fromList [Item "a", Item "b"])]

-- helper functions defined here
-
-allItems :: [ItemSet] -> [Item]
-allItems iss = Set.toList $ foldl add Set.empty iss where
-  add s (ItemSet is) =  Set.union s is
-
-candidates :: [Item] -> [ItemSet]
-candidates x = map (\old -> ItemSet (Set.fromList old)) (powerset x)
-
-powerset       :: [a] -> [[a]]
-powerset []     = [[]]
-powerset (x:xs) = xss /\/ map (x:) xss
-	where xss = powerset xs
-
-(/\/)        :: [a] -> [a] -> [a]
-[]     /\/ ys = ys
-(x:xs) /\/ ys = x : (ys /\/ xs)