94 lines
2.9 KiB
Haskell
94 lines
2.9 KiB
Haskell
module Main where
|
|
|
|
import System.Environment
|
|
import qualified Data.List as List
|
|
import Data.Set (Set)
|
|
import qualified Data.Set as Set
|
|
import Data.Map (Map)
|
|
import qualified Data.Map as Map
|
|
import Text.ParserCombinators.Parsec
|
|
|
|
-- data structures defined here
|
|
|
|
data Item = Item String deriving (Eq, Ord)
|
|
instance Show Item where
|
|
show (Item s) = "Item " ++ s
|
|
|
|
data ItemSet = ItemSet (Set Item) deriving (Eq, Ord)
|
|
instance Show ItemSet where
|
|
show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> x ++ ",") . show) $ Set.toList x
|
|
in "{" ++ y ++ "}"
|
|
|
|
type Support = Double
|
|
type Confidence = Double
|
|
|
|
support :: [ItemSet] -> ItemSet -> Support
|
|
support iss is = (fromIntegral $ count iss is) / (fromIntegral $ length iss)
|
|
where
|
|
count iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss
|
|
|
|
confidence :: [ItemSet] -> ItemSet -> Item -> Confidence
|
|
confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i)
|
|
|
|
-- parse CSV
|
|
|
|
csvFile = endBy line eol
|
|
line = do
|
|
cells <- sepBy cell $ char ','
|
|
return $ ItemSet $ Set.fromList cells
|
|
cell = do
|
|
c <- many $ noneOf ",\r\n"
|
|
return $ Item $ c
|
|
eol = try (string "\n\r")
|
|
<|> try (string "\r\n")
|
|
<|> string "\n"
|
|
<|> string "\r"
|
|
|
|
parseCSV :: String -> Either ParseError [ItemSet]
|
|
parseCSV input = parse csvFile ("unknown") input
|
|
|
|
-- frequent items
|
|
frequentItems :: [ItemSet] -> Support -> Map Item Integer
|
|
frequentItems x y = pi where
|
|
pi = Map.filter (>= frequency) m where
|
|
frequency = ceiling(y * (fromIntegral $ length x)) where
|
|
m = foldl count Map.empty x where
|
|
count m (ItemSet is) = Set.foldl f m is where
|
|
f m i = Map.insertWith acc i 1 m where
|
|
acc new old = old + 1
|
|
|
|
displayFrequentItems :: Map Item Integer -> String
|
|
displayFrequentItems i = Map.foldrWithKey (\key val old -> old ++ (itemToString key val) ++ "\n" ) "" i where
|
|
itemToString (Item str) freq = str++" ("++(show freq)++")"
|
|
|
|
-- frequentItemSets :: [ItemSet] -> Support -> Map ItemSet Integer
|
|
-- frequentItemSets x y = pi where
|
|
-- pi = Map.filter (>= frequency) candidates where
|
|
-- frequency = ceiling(y * (fromIntegral $ length x)) where
|
|
-- candidates = foldl count Map.empty x where
|
|
-- count m (ItemSet is) = Set.foldl f m is where
|
|
-- f m i = Map.insertWith acc i 1 m where
|
|
-- acc new old = old + 1
|
|
|
|
|
|
main :: IO ()
|
|
main = do
|
|
putStrLn "Apriori Algorithm for frequent itemsets mining"
|
|
[f, s] <- getArgs
|
|
c <- readFile f
|
|
case parse csvFile "(stdin)" c of
|
|
Left e -> do putStrLn "Error parsing input: "
|
|
print e
|
|
Right r -> do
|
|
let i = frequentItems r (read s)
|
|
putStrLn $ displayFrequentItems i
|
|
|
|
-- helper functions defined here
|
|
|
|
allItems :: [ItemSet] -> [Item]
|
|
allItems iss = Set.toList $ foldl add Set.empty iss where
|
|
add s (ItemSet is) = Set.union s is
|
|
|
|
candidate :: [ItemSet] -> [Item] -> [ItemSet]
|
|
candidate x y = List.nub $ [ItemSet (Set.union (Set.singleton b) a) | (ItemSet a) <- x, b <- y, Set.notMember b a]
|