rewrote the algorithm from scratch to implement semi-unions

This commit is contained in:
IGI-111
2015-04-03 16:05:22 +02:00
parent 99613bf333
commit e66fad749f

View File

@@ -1,12 +1,8 @@
module Main where module Main where
import System.Environment
import qualified Data.List as List
import Data.Set (Set) import Data.Set (Set)
import qualified Data.Set as Set import qualified Data.Set as Set
import Data.Map (Map) import qualified Data.List as List
import qualified Data.Map as Map
import Text.ParserCombinators.Parsec
-- data structures defined here -- data structures defined here
@@ -16,85 +12,41 @@ instance Show Item where
data ItemSet = ItemSet (Set Item) deriving (Eq, Ord) data ItemSet = ItemSet (Set Item) deriving (Eq, Ord)
instance Show ItemSet where instance Show ItemSet where
show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> " " ++ x) . show) $ Set.toList x show (ItemSet x) = let y = foldl (++) "" $ map show $ Set.toList x in y
in y
type Support = Double type Frequency = Double
type Confidence = Double
type Frequency = Int semiUnion :: ItemSet -> ItemSet -> ItemSet
semiUnion (ItemSet set1) (ItemSet set2) = ItemSet (if max1 <= max2 && Set.delete max1 set1 == Set.delete max2 set2 then set1 `Set.union` set2 else Set.empty) where
max1 = Set.findMax set1
max2 = Set.findMax set2
-- generate all possible combinations from a set of singletons
generateLevels :: [Item] -> [[ItemSet]]
generateLevels singles = until (\x -> head x == lastLevel) (\x -> generateNextLevel (head x) : x) [firstLevel] where
firstLevel = map (\x -> ItemSet $ Set.fromList [x]) singles
lastLevel = [ItemSet $ Set.fromList singles]
generateNextLevel :: [ItemSet] -> [ItemSet]
generateNextLevel level = foldr (\value old -> generate value ++ old) [] level where
generate value = takeWhile (/= empty) (foldr (\x old -> semiUnion value x : old) [] (tail $ List.dropWhile (/= value) level)) where
empty = ItemSet $ Set.fromList []
frequency :: [ItemSet] -> ItemSet -> Frequency frequency :: [ItemSet] -> ItemSet -> Frequency
frequency iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss frequency table (ItemSet set) = setCount / fromIntegral (length table) where
setCount = fromIntegral $ length (filter (\(ItemSet row) -> set `Set.isSubsetOf` row) table)
support :: [ItemSet] -> ItemSet -> Support singletons :: [ItemSet] -> [Item]
support iss is = (fromIntegral $ frequency iss is) / (fromIntegral $ length iss) singletons table = Set.toList $ foldr (\(ItemSet row) old -> old `Set.union` row) (Set.fromList []) table where
confidence :: [ItemSet] -> ItemSet -> Item -> Confidence
confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i)
-- parse CSV
csvFile = endBy line eol
line = do
cells <- sepBy cell $ char ','
return $ ItemSet $ Set.fromList cells
cell = do
c <- many $ noneOf ",\r\n"
return $ Item $ c
eol = try (string "\n\r")
<|> try (string "\r\n")
<|> string "\n"
<|> string "\r"
parseCSV :: String -> Either ParseError [ItemSet]
parseCSV input = parse csvFile ("unknown") input
-- frequent items
frequentItems :: [ItemSet] -> Support -> [Item]
frequentItems x y = Set.elems $ Map.keysSet pi where
pi = Map.filter (>= frequency) m where
frequency = ceiling(y * (fromIntegral $ length x)) where
m = foldl count Map.empty x where
count m (ItemSet is) = Set.foldl f m is where
f m i = Map.insertWith acc i 1 m where
acc new old = old + 1
displayFrequentItems :: Map ItemSet Frequency -> String frequentPatterns :: Frequency -> [ItemSet] -> [[ItemSet]]
displayFrequentItems i = Map.foldrWithKey (\key val old -> old ++ (itemToString key val) ++ "\n" ) "" i where frequentPatterns thresh table = until (\x -> [] == head x) (\x -> filterByFrequency (generateNextLevel (head x)) : x) [firstLevel] where
itemToString is freq = "("++(show freq)++")" ++ (show is) firstLevel = map (\x -> ItemSet $ Set.fromList [x]) (singletons table)
filterByFrequency = filter (\x -> frequency table x >= thresh)
frequentItemSets :: [ItemSet] -> [ItemSet] -> Support -> Map ItemSet Frequency
frequentItemSets r candidates threshold = Map.fromList $ map (\c -> (c, frequency r c)) filtered
where filtered = filter (\c -> support r c >= threshold) candidates
main :: IO() main :: IO()
main = do main = print $ frequentPatterns 1 table where
putStrLn "Apriori Algorithm for frequent itemsets mining" table = [ItemSet (Set.fromList [Item "a", Item "b", Item "c"]), ItemSet (Set.fromList [Item "a", Item "b"])]
[f, s] <- getArgs
c <- readFile f
case parse csvFile "(stdin)" c of
Left e -> do putStrLn "Error parsing input: "
print e
Right r -> do
let is = frequentItems r (read s)
let iss = frequentItemSets r (candidates is) (read s)
putStrLn $ displayFrequentItems iss
-- helper functions defined here
allItems :: [ItemSet] -> [Item]
allItems iss = Set.toList $ foldl add Set.empty iss where
add s (ItemSet is) = Set.union s is
candidates :: [Item] -> [ItemSet]
candidates x = map (\old -> ItemSet (Set.fromList old)) (powerset x)
powerset :: [a] -> [[a]]
powerset [] = [[]]
powerset (x:xs) = xss /\/ map (x:) xss
where xss = powerset xs
(/\/) :: [a] -> [a] -> [a]
[] /\/ ys = ys
(x:xs) /\/ ys = x : (ys /\/ xs)