rewrote the algorithm from scratch to implement semi-unions

This commit is contained in:
IGI-111
2015-04-03 16:05:22 +02:00
parent 99613bf333
commit e66fad749f

View File

@@ -1,12 +1,8 @@
module Main where
import System.Environment
import qualified Data.List as List
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Map (Map)
import qualified Data.Map as Map
import Text.ParserCombinators.Parsec
import qualified Data.List as List
-- data structures defined here
@@ -16,85 +12,41 @@ instance Show Item where
data ItemSet = ItemSet (Set Item) deriving (Eq, Ord)
instance Show ItemSet where
show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> " " ++ x) . show) $ Set.toList x
in y
show (ItemSet x) = let y = foldl (++) "" $ map show $ Set.toList x in y
type Support = Double
type Confidence = Double
type Frequency = Int
type Frequency = Double
semiUnion :: ItemSet -> ItemSet -> ItemSet
semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where
max1 = Set.findMax set1
max2 = Set.findMax set2
-- generate all possible combinations from a set of singletons
generateLevels :: [Item] -> [[ItemSet]]
generateLevels singles = until (\x -> head x == lastLevel) (\x -> generateNextLevel (head x) : x) [firstLevel] where
firstLevel = map (\x -> ItemSet $ Set.fromList [x]) singles
lastLevel = [ItemSet $ Set.fromList singles]
generateNextLevel :: [ItemSet] -> [ItemSet]
generateNextLevel level = foldr (\value old -> generate value ++ old) [] level where
generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) where
empty = ItemSet $ Set.fromList []
frequency :: [ItemSet] -> ItemSet -> Frequency
frequency iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss
frequency table (ItemSet set) = setCount / fromIntegral (length table) where
setCount = fromIntegral $ length (filter (\(ItemSet row) -> set `Set.isSubsetOf` row) table)
support :: [ItemSet] -> ItemSet -> Support
support iss is = (fromIntegral $ frequency iss is) / (fromIntegral $ length iss)
confidence :: [ItemSet] -> ItemSet -> Item -> Confidence
confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i)
-- parse CSV
csvFile = endBy line eol
line = do
cells <- sepBy cell $ char ','
return $ ItemSet $ Set.fromList cells
cell = do
c <- many $ noneOf ",\r\n"
return $ Item $ c
eol = try (string "\n\r")
<|> try (string "\r\n")
<|> string "\n"
<|> string "\r"
parseCSV :: String -> Either ParseError [ItemSet]
parseCSV input = parse csvFile ("unknown") input
-- frequent items
frequentItems :: [ItemSet] -> Support -> [Item]
frequentItems x y = Set.elems $ Map.keysSet pi where
pi = Map.filter (>= frequency) m where
frequency = ceiling(y * (fromIntegral $ length x)) where
m = foldl count Map.empty x where
count m (ItemSet is) = Set.foldl f m is where
f m i = Map.insertWith acc i 1 m where
acc new old = old + 1
singletons :: [ItemSet] -> [Item]
singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where
displayFrequentItems :: Map ItemSet Frequency -> String
displayFrequentItems i = Map.foldrWithKey (\key val old -> old ++ (itemToString key val) ++ "\n" ) "" i where
itemToString is freq = "("++(show freq)++")" ++ (show is)
frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]]
frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where
firstLevel = map (\x -> ItemSet $ Set.fromList [x]) (singletons table)
filterByFrequency = filter (\x -> frequency table x >= thresh)
frequentItemSets :: [ItemSet] -> [ItemSet] -> Support -> Map ItemSet Frequency
frequentItemSets r candidates threshold = Map.fromList $ map (\c -> (c, frequency r c)) filtered
where filtered = filter (\c -> support r c >= threshold) candidates
main :: IO()
main = do
putStrLn "Apriori Algorithm for frequent itemsets mining"
[f, s] <- getArgs
c <- readFile f
case parse csvFile "(stdin)" c of
Left e -> do putStrLn "Error parsing input: "
print e
Right r -> do
let is = frequentItems r (read s)
let iss = frequentItemSets r (candidates is) (read s)
putStrLn $ displayFrequentItems iss
main = print $ frequentPatterns 1 table where
table = [ItemSet (Set.fromList [Item "a", Item "b", Item "c"]), ItemSet (Set.fromList [Item "a", Item "b"])]
-- helper functions defined here
allItems :: [ItemSet] -> [Item]
allItems iss = Set.toList $ foldl add Set.empty iss where
add s (ItemSet is) = Set.union s is
candidates :: [Item] -> [ItemSet]
candidates x = map (\old -> ItemSet (Set.fromList old)) (powerset x)
powerset :: [a] -> [[a]]
powerset [] = [[]]
powerset (x:xs) = xss /\/ map (x:) xss
where xss = powerset xs
(/\/) :: [a] -> [a] -> [a]
[] /\/ ys = ys
(x:xs) /\/ ys = x : (ys /\/ xs)