Files
GitMining/apriori.hs
2015-04-02 10:16:04 +02:00

94 lines
2.9 KiB
Haskell

module Main where
import System.Environment
import qualified Data.List as List
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Map (Map)
import qualified Data.Map as Map
import Text.ParserCombinators.Parsec
-- data structures defined here
data Item = Item String deriving (Eq, Ord)
instance Show Item where
show (Item s) = "Item " ++ s
data ItemSet = ItemSet (Set Item) deriving (Eq, Ord)
instance Show ItemSet where
show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> x ++ ",") . show) $ Set.toList x
in "{" ++ y ++ "}"
type Support = Double
type Confidence = Double
support :: [ItemSet] -> ItemSet -> Support
support iss is = (fromIntegral $ count iss is) / (fromIntegral $ length iss)
where
count iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss
confidence :: [ItemSet] -> ItemSet -> Item -> Confidence
confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i)
-- parse CSV
csvFile = endBy line eol
line = do
cells <- sepBy cell $ char ','
return $ ItemSet $ Set.fromList cells
cell = do
c <- many $ noneOf ",\r\n"
return $ Item $ c
eol = try (string "\n\r")
<|> try (string "\r\n")
<|> string "\n"
<|> string "\r"
parseCSV :: String -> Either ParseError [ItemSet]
parseCSV input = parse csvFile ("unknown") input
-- frequent items
frequentItems :: [ItemSet] -> Support -> Map Item Integer
frequentItems x y = pi where
pi = Map.filter (>= frequency) m where
frequency = ceiling(y * (fromIntegral $ length x)) where
m = foldl count Map.empty x where
count m (ItemSet is) = Set.foldl f m is where
f m i = Map.insertWith acc i 1 m where
acc new old = old + 1
displayFrequentItems :: Map Item Integer -> String
displayFrequentItems i = Map.foldrWithKey (\key val old -> old ++ (itemToString key val) ++ "\n" ) "" i where
itemToString (Item str) freq = str++" ("++(show freq)++")"
-- frequentItemSets :: [ItemSet] -> Support -> Map ItemSet Integer
-- frequentItemSets x y = pi where
-- pi = Map.filter (>= frequency) candidates where
-- frequency = ceiling(y * (fromIntegral $ length x)) where
-- candidates = foldl count Map.empty x where
-- count m (ItemSet is) = Set.foldl f m is where
-- f m i = Map.insertWith acc i 1 m where
-- acc new old = old + 1
main :: IO ()
main = do
putStrLn "Apriori Algorithm for frequent itemsets mining"
[f, s] <- getArgs
c <- readFile f
case parse csvFile "(stdin)" c of
Left e -> do putStrLn "Error parsing input: "
print e
Right r -> do
let i = frequentItems r (read s)
putStrLn $ displayFrequentItems i
-- helper functions defined here
allItems :: [ItemSet] -> [Item]
allItems iss = Set.toList $ foldl add Set.empty iss where
add s (ItemSet is) = Set.union s is
candidate :: [ItemSet] -> [Item] -> [ItemSet]
candidate x y = List.nub $ [ItemSet (Set.union (Set.singleton b) a) | (ItemSet a) <- x, b <- y, Set.notMember b a]