rewrote the algorithm from scratch to implement semi-unions
This commit is contained in:
106
apriori.hs
106
apriori.hs
@@ -1,12 +1,8 @@
|
||||
module Main where
|
||||
|
||||
import System.Environment
|
||||
import qualified Data.List as List
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as Set
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map
|
||||
import Text.ParserCombinators.Parsec
|
||||
import qualified Data.List as List
|
||||
|
||||
-- data structures defined here
|
||||
|
||||
@@ -16,85 +12,41 @@ instance Show Item where
|
||||
|
||||
data ItemSet = ItemSet (Set Item) deriving (Eq, Ord)
|
||||
instance Show ItemSet where
|
||||
show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> " " ++ x) . show) $ Set.toList x
|
||||
in y
|
||||
show (ItemSet x) = let y = foldl (++) "" $ map show $ Set.toList x in y
|
||||
|
||||
type Support = Double
|
||||
type Confidence = Double
|
||||
type Frequency = Int
|
||||
type Frequency = Double
|
||||
|
||||
semiUnion :: ItemSet -> ItemSet -> ItemSet
|
||||
semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where
|
||||
max1 = Set.findMax set1
|
||||
max2 = Set.findMax set2
|
||||
|
||||
-- generate all possible combinations from a set of singletons
|
||||
generateLevels :: [Item] -> [[ItemSet]]
|
||||
generateLevels singles = until (\x -> head x == lastLevel) (\x -> generateNextLevel (head x) : x) [firstLevel] where
|
||||
firstLevel = map (\x -> ItemSet $ Set.fromList [x]) singles
|
||||
lastLevel = [ItemSet $ Set.fromList singles]
|
||||
|
||||
generateNextLevel :: [ItemSet] -> [ItemSet]
|
||||
generateNextLevel level = foldr (\value old -> generate value ++ old) [] level where
|
||||
generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) where
|
||||
empty = ItemSet $ Set.fromList []
|
||||
|
||||
frequency :: [ItemSet] -> ItemSet -> Frequency
|
||||
frequency iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss
|
||||
frequency table (ItemSet set) = setCount / fromIntegral (length table) where
|
||||
setCount = fromIntegral $ length (filter (\(ItemSet row) -> set `Set.isSubsetOf` row) table)
|
||||
|
||||
support :: [ItemSet] -> ItemSet -> Support
|
||||
support iss is = (fromIntegral $ frequency iss is) / (fromIntegral $ length iss)
|
||||
|
||||
confidence :: [ItemSet] -> ItemSet -> Item -> Confidence
|
||||
confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i)
|
||||
|
||||
-- parse CSV
|
||||
|
||||
csvFile = endBy line eol
|
||||
line = do
|
||||
cells <- sepBy cell $ char ','
|
||||
return $ ItemSet $ Set.fromList cells
|
||||
cell = do
|
||||
c <- many $ noneOf ",\r\n"
|
||||
return $ Item $ c
|
||||
eol = try (string "\n\r")
|
||||
<|> try (string "\r\n")
|
||||
<|> string "\n"
|
||||
<|> string "\r"
|
||||
|
||||
parseCSV :: String -> Either ParseError [ItemSet]
|
||||
parseCSV input = parse csvFile ("unknown") input
|
||||
|
||||
-- frequent items
|
||||
frequentItems :: [ItemSet] -> Support -> [Item]
|
||||
frequentItems x y = Set.elems $ Map.keysSet pi where
|
||||
pi = Map.filter (>= frequency) m where
|
||||
frequency = ceiling(y * (fromIntegral $ length x)) where
|
||||
m = foldl count Map.empty x where
|
||||
count m (ItemSet is) = Set.foldl f m is where
|
||||
f m i = Map.insertWith acc i 1 m where
|
||||
acc new old = old + 1
|
||||
singletons :: [ItemSet] -> [Item]
|
||||
singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where
|
||||
|
||||
|
||||
displayFrequentItems :: Map ItemSet Frequency -> String
|
||||
displayFrequentItems i = Map.foldrWithKey (\key val old -> old ++ (itemToString key val) ++ "\n" ) "" i where
|
||||
itemToString is freq = "("++(show freq)++")" ++ (show is)
|
||||
frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]]
|
||||
frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where
|
||||
firstLevel = map (\x -> ItemSet $ Set.fromList [x]) (singletons table)
|
||||
filterByFrequency = filter (\x -> frequency table x >= thresh)
|
||||
|
||||
frequentItemSets :: [ItemSet] -> [ItemSet] -> Support -> Map ItemSet Frequency
|
||||
frequentItemSets r candidates threshold = Map.fromList $ map (\c -> (c, frequency r c)) filtered
|
||||
where filtered = filter (\c -> support r c >= threshold) candidates
|
||||
|
||||
main :: IO()
|
||||
main = do
|
||||
putStrLn "Apriori Algorithm for frequent itemsets mining"
|
||||
[f, s] <- getArgs
|
||||
c <- readFile f
|
||||
case parse csvFile "(stdin)" c of
|
||||
Left e -> do putStrLn "Error parsing input: "
|
||||
print e
|
||||
Right r -> do
|
||||
let is = frequentItems r (read s)
|
||||
let iss = frequentItemSets r (candidates is) (read s)
|
||||
putStrLn $ displayFrequentItems iss
|
||||
main = print $ frequentPatterns 1 table where
|
||||
table = [ItemSet (Set.fromList [Item "a", Item "b", Item "c"]), ItemSet (Set.fromList [Item "a", Item "b"])]
|
||||
|
||||
-- helper functions defined here
|
||||
|
||||
allItems :: [ItemSet] -> [Item]
|
||||
allItems iss = Set.toList $ foldl add Set.empty iss where
|
||||
add s (ItemSet is) = Set.union s is
|
||||
|
||||
candidates :: [Item] -> [ItemSet]
|
||||
candidates x = map (\old -> ItemSet (Set.fromList old)) (powerset x)
|
||||
|
||||
powerset :: [a] -> [[a]]
|
||||
powerset [] = [[]]
|
||||
powerset (x:xs) = xss /\/ map (x:) xss
|
||||
where xss = powerset xs
|
||||
|
||||
(/\/) :: [a] -> [a] -> [a]
|
||||
[] /\/ ys = ys
|
||||
(x:xs) /\/ ys = x : (ys /\/ xs)
|
||||
|
||||
Reference in New Issue
Block a user