同じ画像を検索して削除する | ITに頼って生きていく

- 2月

2019

Posted By : boomin

No Comments

同じ画像を検索して削除する

1. 同じ画像を検索して、比較し、同じならば削除する

前回、機械学習用の画像の前処理として、教師データが重複する可能性を検討しました。

この時は、同じファイル名かつ、同じサイズ、同じチャンネル数の画像は、同じ画像である と判断して削除するということを行いました。

が、改めてよくよく収集した画像たちを見ると、同じ画像だけど大きさが異なる（リサイズされたもの） が、時々あることが判明ェ。。。。。。

これでは、本命/義理がどう組み合わさろうとも、全く同じ画像が複数枚存在してしまっては、学習に差し障ります。そこで、学習データ中から同じ画像を検索し、一方のみを残し一方を削除することを行います。

2. 処理手順

2.1 同じ画像を検索する

画像の検索ロジックとしては、

本命チョコ画像と義理チョコ画像の、両方の全画像一覧を作成
一覧中から1つの画像を選択し、当該画像以外のすべての画像との組み合わせで類似性を評価
ある基準以上に類似性が認められれば、同じ画像として判定し削除

2.2 同じ画像かどうか判定する

こんなディレクトリ構造とします。

├─preprocess.py   # 前処理本体
├─ImageTooles.py  # preprocess.pyから呼び出される関数をまとめた部品
└──images
   ├─compImg  # 以下のcode実行で、同じと判定された画像の比較エビデンスを格納
   ├─giri     # 学習用の義理チョコ画像を格納
   └─honmei   # 学習用の本命チョコ画像を格納

ImageTooles.py

# -*- coding: utf-8 -*-
"""
Created on Sat Feb  2 13:32:13 2019
@author: boomin
"""

import os
from PIL import Image
import cv2
import numpy as np
import glob
import pandas as pd

def getImages(honImgDir, girImgDir):
  """Build one table describing every image file in the two directories.

  Files are collected with the glob pattern ``<dir>*.*``, so each directory
  string is expected to already end with a path separator.

  Args:
    honImgDir: directory prefix of the honmei (true-love) chocolate images.
    girImgDir: directory prefix of the giri (obligation) chocolate images.

  Returns:
    A pandas DataFrame indexed by "fullpath" (honmei rows first, then giri
    rows) with the columns:
      "filename" - base name of the file
      "shape"    - 3-tuple returned by getImageSize() for the file
      "aratio"   - shape[0] / shape[1] rounded to 2 decimals, or 0 when
                   shape[1] is 0
  """
  def _listing(imgDir):
    # One row per path matched under imgDir.
    paths = glob.glob(imgDir + "*.*")
    return pd.DataFrame({
      "filename": list(map(os.path.basename, paths)),
      "fullpath": list(paths),
    })

  # Stack both listings into a single frame with a fresh 0..n-1 index.
  table = pd.concat(
    [_listing(honImgDir), _listing(girImgDir)], axis=0
  ).reset_index(drop=True)

  # Image shape per file.
  table["shape"] = [ getImageSize(path) for path in table["fullpath"] ]

  # Aspect ratio per file.
  # NOTE(review): if getImageSize() returns a NumPy-style shape, the first two
  # entries are (rows, cols), i.e. this is height/width - confirm.
  ratios = []
  for first, second, _channels in table["shape"]:
    if second != 0:
      ratios.append(round(first/second, 2))
    else:
      ratios.append(0)
  table["aratio"] = ratios

  return table.set_index("fullpath")

def getImageSize(im):
  """Return the pixel-array shape of an image file as a 3-tuple.

  Args:
    im: path of the image file to inspect.

  Returns:
    The shape of the image converted to a NumPy array when that shape has
    three entries; ``(shape[0], shape[1], 0)`` when it has a different
    number of entries (e.g. a 2-D single-channel image); ``(0, 0, 0)`` when
    the file cannot be opened or decoded.
  """
  try:
    # The context manager releases the file handle as soon as the pixels
    # have been read. A handle left open can prevent the file from being
    # deleted later on some platforms.
    with Image.open(im) as rimf:
      isize = np.asarray(rimf).shape
    return isize if len(isize)==3 else (isize[0],isize[1],0)
  except Exception:
    # Best-effort by design: any unreadable file is reported as an empty
    # shape. KeyboardInterrupt/SystemExit are deliberately not swallowed.
    return (0,0,0)

# Deletes a file; called for every image that was judged to be a duplicate.
def removefile(file):
  """Delete ``file`` if it exists and report what happened.

  Args:
    file: path of the file to delete.

  Returns:
    ``file`` itself when the file existed and was removed (a "deleted"
    line is printed), or an empty string when no such path exists.
  """
  # Nothing to do for a path that is not there.
  if not os.path.exists(file):
    return ""

  os.remove(file)
  print(f"\tdeleted {file}")
  return file

def convertImage(fpath,shape):
  """Re-encode an image that is not 3-channel as an RGB ``.jpg`` file.

  The converted image is written next to the original, with the extension
  replaced by ``.jpg``, and the original file is then deleted. If anything
  goes wrong, the original file is deleted instead (best effort).

  Args:
    fpath: path of the image file.
    shape: shape tuple previously measured for the file; the file is left
      untouched when ``shape`` has at least 3 entries and ``shape[2] == 3``.

  Returns:
    1 when the file was converted, 0 when no conversion was needed or the
    conversion failed.
  """
  root, _ext = os.path.splitext(fpath)
  target = f"{root}.jpg"
  try:
    if len(shape)>=3 and shape[2]==3:
      return 0
    print(f"converted from {shape} : {os.path.basename(fpath)}")
    # Close the source handle before anything is written or deleted.
    with Image.open(fpath) as rimf:
      rgb_im = rimf.convert('RGB')
    # Save the converted image.
    rgb_im.save(target)
    # Delete the source image - but only when it is a different file.
    # For a source that is already "*.jpg" (e.g. a grayscale JPEG) the
    # converted image was just written onto the same path, and deleting it
    # would throw the image away entirely.
    if not os.path.samefile(fpath, target):
      removefile(fpath)
    return 1
  except Exception:
    # An image that cannot be converted is useless as training data, so it
    # is removed; failure to remove it is reported but not fatal.
    try:
      removefile(fpath)
      print(f"\t removing by error: {fpath} ")
    except Exception:
      print(f"cannot remove {fpath}")
    return 0
      
def getImageProp(im1,im2):
  """Detect AKAZE features in two images and match them against each other.

  Both files are read with OpenCV, resized to 96x96 and converted to
  grayscale before feature detection. Descriptors are matched with a
  brute-force matcher using the Hamming norm.

  Args:
    im1: path of the first image.
    im2: path of the second image.

  Returns:
    A tuple ``(matches, img1, img2, kp1, kp2)``: the matcher result, the two
    resized colour images, and the keypoints detected in each image.
  """
  IMG_SIZE = (96, 96)

  def _load(path):
    # Read the file, shrink it to the common size, and derive a gray copy.
    colour = cv2.resize(cv2.imread(path), IMG_SIZE)
    return colour, cv2.cvtColor(colour, cv2.COLOR_BGR2GRAY)

  img1, gray1 = _load(im1)
  img2, gray2 = _load(im2)

  akaze = cv2.AKAZE_create()
  kp1, des1 = akaze.detectAndCompute(gray1, None)
  kp2, des2 = akaze.detectAndCompute(gray2, None)

  matches = cv2.BFMatcher(cv2.NORM_HAMMING).match(des1, des2)

  return matches, img1, img2, kp1, kp2

def getSimFeature(im1,im2):
  """Score how similar two image files are (lower means more similar).

  The score is the mean distance over all feature matches returned by
  getImageProp(). When the score is below 10, a side-by-side comparison
  picture is additionally saved under ``images/compImg/`` as evidence.

  Args:
    im1: path of the reference image.
    im2: path of the image compared against it.

  Returns:
    The mean match distance, or 100000 when the score cannot be computed
    (unreadable file, no features, no matches, ...).
  """
  try:
    matches, img1, img2, kp1, kp2 = getImageProp(im1,im2)
    dist = [m.distance for m in matches]
    ret = sum(dist) / len(dist)
  except Exception:
    # Sentinel far above any threshold: "treat as not similar".
    return 100000

  if ret<10:
    # Saving the evidence picture is a side task. It is kept outside the
    # scoring try-block so that a failure here (e.g. output directory
    # missing) cannot turn a detected duplicate into the "not similar"
    # sentinel.
    try:
      dname = "images" + os.sep + "compImg" + os.sep
      os.makedirs(dname, exist_ok=True)
      fname = os.path.basename(im2)
      showImg(dname+"comp_"+fname, matches, img1, img2, kp1, kp2)
    except Exception as err:
      print(f"\tcould not save comparison image for {im2}: {err}")
  return ret
  
def feature_detection(im1,splist):
  """Score one reference image against each candidate in a list.

  Args:
    im1: path of the reference image.
    splist: iterable of candidate image paths.

  Returns:
    A list of ``(candidate_path, score)`` tuples in the order of ``splist``,
    where score is the value returned by getSimFeature(im1, candidate_path).
  """
  scored = []
  for candidate in splist:
    scored.append((candidate, getSimFeature(im1, candidate)))
  return scored


import matplotlib.pyplot as plt
def showImg(title, matches, img1, img2, kp1, kp2):
  """Save a picture of two images with their feature matches drawn.

  Args:
    title: output path without extension; ``.png`` is appended.
    matches: feature matches between the two images (objects with a
      ``distance`` attribute).
    img1: first image (BGR array).
    img2: second image (BGR array).
    kp1: keypoints detected in ``img1``.
    kp2: keypoints detected in ``img2``.
  """
  matches = sorted(matches, key = lambda x:x.distance)
  img3 = cv2.drawMatches(img1, kp1, img2, kp2, matches, None, flags=2)
  # Draw on a dedicated figure and close it afterwards. Drawing on the
  # implicit current figure would stack a new image onto the same axes on
  # every call, so memory use and save time would grow with each duplicate.
  fig = plt.figure()
  try:
    plt.imshow(cv2.cvtColor(img3, cv2.COLOR_BGR2RGB))
    plt.savefig(f"{title}.png", bbox_inches='tight')
  finally:
    plt.close(fig)

preprocess.py

# -*- coding: utf-8 -*-
"""
Created on Wed Jan 15 14:00:06 2019
@author: boomin
"""

import os
import numpy as np

import multiprocessing
from joblib import Parallel, delayed
from itertools import chain

from ImageTooles import getImages, convertImage, feature_detection, removefile

# Preprocessing script: removes unusable and duplicated training images.
# NOTE(review): everything below runs at import time and starts worker
# processes; consider an `if __name__ == "__main__":` guard - confirm against
# the joblib documentation for the platform in use.

# Directory holding the honmei (true-love) chocolate images
honImgDir  = "images" + os.sep + "honmei" + os.sep
# Directory holding the giri (obligation) chocolate images
girImgDir  = "images" + os.sep + "giri"   + os.sep

print("loading Image files...")
ifiles = getImages(honImgDir, girImgDir)

print("deleting images which have strange aspect ratio...")
# Delete images whose aspect ratio is extremely large or small
# (or could not be measured at all, which is recorded as 0).
iratio = 16./9. # the aspect-ratio threshold is 16:9
badratio = ifiles[(ifiles["aratio"]>iratio)|(ifiles["aratio"]<1/iratio)|(ifiles["aratio"]==0)].index
# removefile() returns "" when nothing was deleted, so only real deletions
# are counted.
ndeleted = sum(1 for fpath in badratio if removefile(fpath))
print(f"\tdelete {ndeleted} files")

print("deleting images which have same shape and the name...")
# Find files that share file name and shape (size and channel count) and
# delete every member of each such group.
dupfiles = ifiles[ifiles.duplicated(subset=['filename', 'shape'], keep=False)]
ndeleted = sum(1 for fpath in dupfiles.index if removefile(fpath))
print(f"\tdelete {ndeleted} files")

# Convert image formats.
print("converting file format...")
# ifiles still lists the files deleted above; skipping them avoids pointless
# open failures and misleading "removing by error" messages.
done = [
  convertImage(fpath,shape)
  for fpath,shape in zip(ifiles.index, ifiles['shape'])
  if os.path.exists(fpath)
]
print(f"{sum(done)} files are converted.")

# Re-read the list of files that remain now.
print("re-get all target files...")
ifiles = getImages(honImgDir, girImgDir)

# Similarity threshold below which an image is deleted
th = 10

print("deleting images which have same features...")
# Image comparison is CPU-bound, so it is spread over all cores.
n_jobs = multiprocessing.cpu_count()
# Sets give O(1) membership tests; both are consulted for every candidate.
donelist = set()   # images already used as the reference image
removed  = set()   # images deleted as duplicates during this loop
# Delete images whose features say they are the same picture.
for i, im1 in enumerate(ifiles.index):
  # Mark as processed.
  donelist.add(im1)
  # A file deleted earlier in this loop can no longer be read, so comparing
  # against it can never find a duplicate.
  if im1 in removed:
    continue
  # Only images with a similar aspect ratio are candidates.
  nsp = ifiles.at[im1,"aratio"]
  silist = ifiles[(ifiles["aratio"]>nsp*0.99)&(ifiles["aratio"]<nsp*1.01)]
  # Pairs involving an already-processed image were evaluated when that image
  # was the reference, so they are excluded (this also excludes im1 itself).
  silist = [ n for n in silist.index if n not in donelist and n not in removed ]
  print(f"{i}/{len(ifiles)}: proving {len(silist)} files")
  if not silist:
    # Nothing to compare: do not spin up workers for empty chunks.
    continue
  splited = np.array_split(silist, n_jobs)
  results = Parallel(n_jobs=n_jobs, verbose=0) (
    [ delayed(feature_detection)(im1,splist) for splist in splited ]
  )
  # Flatten the per-worker result lists.
  results = list(chain.from_iterable(results))
  # Delete every candidate whose score is below the threshold.
  for f,v in results:
    if v<th:
      removefile(f)
      removed.add(str(f))

画像同士を総当たりで比較する必要があるため、非常に処理時間がかかってしまいます。そこで、

同じようなアスペクト比の画像だけを類似評価の比較対象にする
joblibで並列処理をさせる（並列数はmultiprocessing.cpu_count()で取得したCPUコア数）

ことで、処理時間の短縮を図っています。

2.3 同じ画像として判定された画像の比較

こんな風に、同じ画像だと判断されたら、その比較画像として特徴点とともに出力します。たとえば、こんな感じです。

確かに、特徴的な点が示されていて、同じ画像であることがわかりますね！一覧にすると、こんな感じでした。

これらの画像を削除することで、いよいよ学習データがそろいました。次回以降、機械学習のモデルなどについて書いていきます。

ITに頼って生きていく

1. 同じ画像を検索して、比較し、同じならば削除する

2. 処理手順

2.1 同じ画像を検索する

2.2 同じ画像かどうか判定する

2.3 同じ画像として判定された画像の比較

関連

コメントを残すコメントをキャンセル