Compare commits

...

13 Commits

Author SHA1 Message Date
7e448da839 Use comonads 2019-01-24 21:02:27 +02:00
8833880c22 Clean up 2019-01-02 20:13:20 +02:00
53b86d7efb Parameterize on range and algorithm 2019-01-02 20:08:30 +02:00
27a236619e Optional range 2019-01-02 20:04:46 +02:00
4139562a49 Remove the intermediary list 2019-01-02 19:05:44 +02:00
22c19d0729 Print current state 2019-01-02 18:12:30 +02:00
61a6efe94e fromList and toList methods 2019-01-02 18:10:31 +02:00
08fdd68302 Fix memory profile 2019-01-02 08:02:40 +02:00
b8884d4bf4 Strictness 2019-01-01 23:47:32 +02:00
c7e074b73f Base feature set done 2019-01-01 23:36:58 +02:00
6649744654 Debug printing 2019-01-01 23:14:34 +02:00
b61ca54011 Record to fingerprint 2019-01-01 21:06:48 +02:00
dbb5cbb209 BKTree implementation 2019-01-01 20:43:33 +02:00
3 changed files with 144 additions and 17 deletions

View File

@ -18,12 +18,13 @@ cabal-version: >=1.10
executable imageduplicates
main-is: Main.hs
ghc-options: -Wall -threaded -rtsopts -O2
-- other-modules:
other-modules: Data.BKTree
-- other-extensions:
build-depends: base >=4.11 && <4.12
, JuicyPixels
, JuicyPixels-extra
, bytestring
, either
, generic-lens
, lens
, mtl
@ -31,7 +32,10 @@ executable imageduplicates
, pipes
, pipes-files
, pipes-safe
, recursion-schemes
, text
, transformers
, directory
, filepath
hs-source-dirs: src
default-language: Haskell2010

71
src/Data/BKTree.hs Normal file
View File

@ -0,0 +1,71 @@
{-# LANGUAGE DeriveFoldable #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE KindSignatures #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
module Data.BKTree where
import Data.Functor.Foldable
import Data.Functor.Foldable.TH
import GHC.Generics (Generic)
import Data.List (foldl')
import Data.Foldable (foldMap)
import Data.Monoid (Endo(..))
-- Point for testing purposes
data Point = Point Int Int deriving Show
instance Metric Point where
distance (Point p1 p2) (Point q1 q2) = abs (p1 - q1) + abs (p2 - q2)
class Metric a where
distance :: a -> a -> Int
data Tuple a = Tuple !Int a deriving (Show, Functor, Foldable, Traversable)
data BKTree a = Empty
| Node !a [Tuple (BKTree a)] deriving (Show, Generic, Functor, Traversable, Foldable)
makeBaseFunctor ''BKTree
empty :: BKTree a
empty = Empty
singleton :: Metric a => a -> BKTree a
singleton a = insert a empty
fromList :: Metric a => [a] -> BKTree a
fromList = foldl' (\acc x -> insert x acc) empty
toList :: BKTree a -> [a]
toList tree = appEndo (foldMap (\x -> Endo ([x] ++)) tree) []
insert :: Metric a => a -> BKTree a -> BKTree a
insert a = \case
Empty -> Node a []
Node b children ->
let newDistance = distance a b
in Node b (addChild newDistance children)
where
addChild d = \case
[] -> (Tuple d (insert a Empty)) : []
(Tuple d' child):children | d == d' -> (Tuple d' (insert a child)) : children
| otherwise -> (Tuple d' child) : addChild d children
search :: forall a. Metric a => Int -> a -> BKTree a -> [a]
search n a tree = cata alg tree
where
alg :: BKTreeF a [a] -> [a]
alg = \case
EmptyF -> []
NodeF x children ->
let thisDistance = distance a x
upper = thisDistance + n
lower = thisDistance - n
filteredChildren = concat [xs | Tuple d xs <- children, d <= upper, d >= lower]
in if thisDistance <= n then x : filteredChildren else filteredChildren

View File

@ -1,45 +1,97 @@
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeApplications #-}
module Main where
import Codec.Picture
import Codec.Picture.Extra (scaleBilinear)
import Data.Bifunctor (second)
import Control.Comonad (extend, extract)
import Control.Comonad.Store (Store, experiment, seek, store)
import Control.Exception (SomeException, try)
import Control.Monad (forM_, void, when)
import Data.Bits
import qualified Data.BKTree as BK
import Data.List (foldl')
import Data.Maybe (fromMaybe)
import Data.Word (Word64)
import Options.Generic
import Pipes
import Pipes.Files
import qualified Pipes.Prelude as P
import Pipes.Safe (runSafeT)
import System.Directory (createDirectoryIfMissing,
createFileLink)
import System.FilePath (takeFileName, (</>))
data Alg = Average | DHash deriving (Read, Show, Generic)
instance ParseField Alg
data Cmd = Cmd { source :: FilePath
, target :: FilePath
, recursive :: Bool
, range :: Maybe Int
, algorithm :: Maybe Alg
} deriving (Show, Generic, ParseRecord)
fingerprint :: DynamicImage -> Word64
fingerprint = hash . grey . scale . convertRGB8
data Fingerprint =
Fingerprint { imagePath :: FilePath
, hash :: !Word64
} deriving Show
instance BK.Metric Fingerprint where
-- hamming distance
distance (Fingerprint _ a) (Fingerprint _ b) =
let xored = a `xor` b
in foldr (\shiftA acc -> acc + if 1 `shift` shiftA .&. xored > 0 then 1 else 0) 0 [0..63]
fingerprint :: Alg -> DynamicImage -> Word64
fingerprint alg = hash . grey . scale . convertRGB8
where
scale :: Image PixelRGB8 -> Image PixelRGB8
scale = scaleBilinear 8 8
grey :: Image PixelRGB8 -> Image Pixel8
grey = pixelMap (\(PixelRGB8 r g b) -> ceiling ((fromIntegral r * (0.3 :: Double)) + (fromIntegral g * 0.59) + (fromIntegral b * 0.11)))
indexes = [(x,y) | x <- [0..7], y <- [0..7]]
hashWith :: Image Pixel8 -> (Store (Int, Int) Pixel8 -> Bool) -> Word64
hashWith img f =
let s = store (uncurry (pixelAt img)) (0,0)
img' = extend f s
at pos = extract $ seek pos img'
in foldr (\(shiftA, b) acc -> if b then 1 `shift` shiftA .|. acc else acc) 0 $ zip [0..] $ map at indexes
hash :: Image Pixel8 -> Word64
hash img = -- the average fingerprint method
let avg = fromIntegral (foldl' (\acc (x,y) -> acc + fromIntegral (pixelAt img x y)) (0 :: Int) [(x,y) | x <- [0..7], y <- [0..7]] `div` 64)
in foldr (\(shiftA, b) acc -> if b then 1 `shift` shiftA .|. acc else acc) 0 $ zip [0..] [pixelAt img x y > avg | x <- [0..7], y <- [0..7]]
hash img = case alg of
Average ->
let avg = fromIntegral (foldl' (\acc (x,y) -> acc + fromIntegral (pixelAt img x y)) (0 :: Int) [(x,y) | (x,y) <- indexes] `div` 64)
in img `hashWith` (avgAlg avg)
DHash -> img `hashWith` dAlg
dAlg :: Store (Int, Int) Pixel8 -> Bool
dAlg img = case experiment (\(x,y) -> [(x,y), (succ x `mod` 8, y)]) img of
[l,r] -> l > r
_ -> False
avgAlg :: Pixel8 -> Store (Int, Int) Pixel8 -> Bool
avgAlg avg img = extract img > avg
main :: IO ()
main = do
Cmd{..} <- getRecord "Image duplicate finder"
runSafeT $
runEffect $
for (find source (glob "*.jpg" <> regular) >-> P.mapM (\path -> fmap (path,) <$> liftIO (readImage path)) >-> P.map (fmap (second fingerprint))) (liftIO . print)
putStrLn "Hello, Haskell!"
index <- runSafeT (P.fold foldTree BK.empty id (find source (glob "*.jpg" <> regular)
>-> P.mapM readImg
>-> P.map (fmap (toFingerprint (fromMaybe Average algorithm)))))
forM_ index $ \fp -> do
let similar = BK.search (fromMaybe 1 range) fp index
when (length similar > 1) $ do
print similar
let targetDir = target </> show (hash fp)
createDirectoryIfMissing True targetDir
forM_ similar $ \fp' ->
void $ try @SomeException (createFileLink (imagePath fp') (targetDir </> takeFileName (imagePath fp')))
where
readImg path = liftIO (putStrLn path >> fmap (path,) <$> readImage path)
foldTree acc = either (const acc) (\x -> x `seq` BK.insert x acc)
toFingerprint alg (path,img) = Fingerprint path (fingerprint alg img)