GitMining/apriori.hs

module Main where

import System.Environment
import qualified Data.List as List
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Map (Map)
import qualified Data.Map as Map
import Text.ParserCombinators.Parsec

-- data structures defined here

data Item = Item String deriving (Eq, Ord)
instance Show Item where
  show (Item s) = "Item " ++ s

data ItemSet = ItemSet (Set Item) deriving (Eq, Ord)
instance Show ItemSet where
  show (ItemSet x) = let y = foldl (++) "" $ map ((\x -> x ++ ",") . show) $ Set.toList x
                         in "{" ++ y ++ "}"

type Support = Double
type Confidence = Double

support :: [ItemSet] -> ItemSet -> Support
support iss is = (fromIntegral $ count iss is) / (fromIntegral $ length iss)
  where
    count iss (ItemSet is) = length $ filter (Set.isSubsetOf is) $ map (\(ItemSet x) -> x) iss

confidence :: [ItemSet] -> ItemSet -> Item -> Confidence
confidence iss i@(ItemSet is) j = (support iss $ ItemSet $ Set.union is $ Set.singleton j) / (support iss i)

-- parse CSV

csvFile = endBy line eol
line = do
  cells <- sepBy cell $ char ','
  return $ ItemSet $ Set.fromList cells
cell = do
  c <- many $ noneOf ",\r\n"
  return $ Item $ c
eol =   try (string "\n\r")
    <|> try (string "\r\n")
    <|> string "\n"
    <|> string "\r"

parseCSV :: String -> Either ParseError [ItemSet]
parseCSV input = parse csvFile ("unknown") input

-- frequent items
frequentItems :: [ItemSet] -> Support -> Map Item Integer
frequentItems x y = pi where
  pi = Map.filter (>= frequency) m where
  frequency = ceiling(y * (fromIntegral $ length x)) where
  m = foldl count Map.empty x where
  count m (ItemSet is) = Set.foldl f m is where
  f m i = Map.insertWith acc i 1 m where
  acc new old = old + 1

displayFrequentItems :: Map Item Integer -> String
displayFrequentItems i = Map.foldrWithKey (\key val old -> old ++ (itemToString key val) ++ "\n" ) "" i where
	itemToString (Item str) freq = str++" ("++(show freq)++")"

-- frequentItemSets :: [ItemSet] -> Support -> Map ItemSet Integer
-- frequentItemSets x y = pi where
--   pi = Map.filter (>= frequency) candidates where
--   frequency = ceiling(y * (fromIntegral $ length x)) where
--   candidates = foldl count Map.empty x where
--   count m (ItemSet is) = Set.foldl f m is where
--   f m i = Map.insertWith acc i 1 m where
--   acc new old = old + 1


main :: IO ()
main = do
  putStrLn "Apriori Algorithm for frequent itemsets mining"
  [f, s] <- getArgs
  c <- readFile f
  case parse csvFile "(stdin)" c of
    Left e -> do putStrLn "Error parsing input: "
                 print e
    Right r -> do
      let i = frequentItems r (read s)
      putStrLn $ displayFrequentItems i

-- helper functions defined here

allItems :: [ItemSet] -> [Item]
allItems iss = Set.toList $ foldl add Set.empty iss where
  add s (ItemSet is) =  Set.union s is

candidate :: [ItemSet] -> [Item] -> [ItemSet]
candidate x y = List.nub $ [ItemSet (Set.union (Set.singleton b) a) | (ItemSet a) <- x, b <- y, Set.notMember b a]