Python Forum
multiprocessing phash from every frame in folder
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
multiprocessing phash from every frame in folder
#1
Hello,

with great support from this forum I'm working on a tool which can detect hangers in a Super8-Film.

I need some support in using multiprocessing.

I create the hash string from frames in a folder. I'm planning to treat this occurrence with multiprocessing to speed it up.

Here is the complete code:

main_short.py:
import hanger_detection
from multiprocessing import Pool
import time

hangers = []
measure_time_start = time.time()
# Create a worker pool with the default number of processes.
pool = Pool()
# NOTE(review): Pool.map() needs an iterable of work items as its second
# argument; called with only the function, this line raises TypeError.
pool.map(hanger_detection.create_phash)
pool.close()
# The hashes used below come from this ordinary, sequential call.
frame_hash_list = hanger_detection.create_phash()
hangers = hanger_detection.detect_hangers(frame_hash_list)
measure_time_end = time.time()
number_of_hangers = len(hangers)
hanger_detection.fill_hanger_information_in_excel(hangers)

print("frame_hash_list: " + str(frame_hash_list))
print("hangers: " + str(hangers))
# "Zeit" is German for "time": seconds between the two time.time() readings.
print("Zeit: " + str(measure_time_end - measure_time_start))
print("number_of_hangers: " + str(number_of_hangers))
hanger_detection.py:
import os
from PIL import Image
import imagehash
import openpyxl
from itertools import zip_longest


def difference_count(a: str, b: str) -> int:
    """Return the number of positions at which a and b differ.

    The strings are compared position by position; when one string is
    shorter, every surplus position of the longer one counts as a difference.
    """
    mismatches = 0
    for left, right in zip_longest(a, b):
        if left != right:
            mismatches += 1
    return mismatches


def create_phash(frame_dir="D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/phash_test/"):
    """Return the perceptual hash string of every frame image in frame_dir.

    frame_dir : folder that contains the frame images.  Defaults to the
    project's test folder.

    The returned list is ordered by file name, so consecutive list items are
    consecutive frames as long as the file names sort in frame order (for
    example zero-padded frame numbers).
    """
    # os.scandir() yields entries in arbitrary order, so sort explicitly; the
    # "with" block releases the directory handle even if an error occurs.
    with os.scandir(frame_dir) as entries:
        frame_names = sorted(entry.name for entry in entries if entry.is_file())

    frame_hash_list = []
    for name in frame_names:
        # Close each image right after hashing instead of keeping every file
        # handle open until garbage collection.
        with Image.open(os.path.join(frame_dir, name)) as frame:
            frame_hash_list.append(str(imagehash.phash(frame)))
    return frame_hash_list


def detect_hangers(frame_hash_list, threshold: int = 0, min_count: int = 4):
    """Return list of "hangers" detected in frame_hash_list.
    A "hanger" is a run of consecutive frames that are the same.

    frame_hash_list : list of frame hash strings.  Frames are considered
    same or different by counting the differences in their hash strings.

    threshold : Maximum number of differences allowed for two frames to be
    considered "same".

    min_count : Minimum length of a hanger.  Short hangers aren't noticeable
    and don't have to be removed.

    Returns a list of (start_index, end_index) tuples, both indexes
    inclusive.  An empty frame_hash_list yields an empty list.
    """
    hangers = []  # List of hanger (start, stop) frame indexes
    if not frame_hash_list:
        return hangers

    start_index = 0
    start_frame = frame_hash_list[0]
    for index, frame in enumerate(frame_hash_list[1:], start=1):
        # Are frame and start_frame dissimilar enough?
        if difference_count(start_frame, frame) > threshold:
            if index - start_index >= min_count:
                # Add hanger to list
                hangers.append((start_index, index - 1))
            start_frame = frame
            start_index = index

    # Check if we end with a hanger.  The last run reaches up to and including
    # the final frame, so its length is last_index - start_index + 1; it is
    # held to the same min_count as every other run.
    last_index = len(frame_hash_list) - 1
    if last_index - start_index + 1 >= min_count:
        hangers.append((start_index, last_index))
    return hangers


def convert_frame_nr_in_time(d, frames_per_second=20):
    """Split frame number d into whole hours, minutes and seconds of playing time.

    d : frame number, counted from the start of the film.

    frames_per_second : frame rate of the digitised film.  The default of 20
    gives 1200 frames per minute and 72000 frames per hour.

    Returns a tuple (hours, minutes, seconds) of floats holding whole numbers;
    frames that do not fill a complete second are dropped.
    """
    frames_per_minute = 60 * frames_per_second
    frames_per_hour = 60 * frames_per_minute

    # divmod() yields the whole units and the remaining frames in one step.
    hours, rest = divmod(d, frames_per_hour)
    minutes, rest = divmod(rest, frames_per_minute)
    seconds = rest // frames_per_second

    # Floats are returned because callers have always received floats here.
    return float(hours), float(minutes), float(seconds)


def _frame_nr_to_timestamp(frame_nr):
    """Return frame_nr as a zero-padded "HH:MM:SS" string."""
    hours, minutes, seconds = convert_frame_nr_in_time(frame_nr)
    # convert_frame_nr_in_time() returns floats; truncate before formatting.
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"


def fill_hanger_information_in_excel(hangers):
    """Write the start and end timestamp of every hanger into the workbook.

    hangers : sequence of hangers; item 0 of each hanger is its first frame
    number and item 1 its last frame number.

    Sheet "Blatt" is filled from row 5 downwards, one row per hanger: the
    start timestamp goes into column 2 and the end timestamp into column 4.
    The first 100 rows of that area are emptied beforehand, and the workbook
    is saved back to the same file.
    """
    p = "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/S8-Hanger_Positionen.xlsx"
    first_row = 5
    start_column = 2
    end_column = 4

    fileXLSX = openpyxl.load_workbook(p)
    sheet = fileXLSX["Blatt"]

    # clear old hanger information
    # film doesn't have more than 100 hangers
    # Columns 2 through 4 are emptied so that no end timestamp from an earlier
    # run is left behind in column 4.
    for row in range(first_row, first_row + 100):
        for column in range(start_column, end_column + 1):
            sheet.cell(row=row, column=column).value = None

    # fill in hanger information
    for row, hanger in enumerate(hangers, start=first_row):
        timestamp_start_str = _frame_nr_to_timestamp(hanger[0])
        timestamp_end_str = _frame_nr_to_timestamp(hanger[1])
        sheet.cell(row=row, column=start_column).value = timestamp_start_str
        sheet.cell(row=row, column=end_column).value = timestamp_end_str
    fileXLSX.save(p)
In hanger_detection.py (line 17) I loop over all frames. There should be in some way the multiprocessing - but I don't know how...

The simple example following repeats a function 4 times:
from multiprocessing import Pool
import time

def cpu_extensive(i):
    """Stand in for an expensive job: pause two seconds, then report job i as done."""
    pause_seconds = 2
    time.sleep(pause_seconds)
    print(i, "Done")

if __name__ == "__main__":
    # The guard keeps worker processes that re-import this file (the "spawn"
    # start method, which is what Windows uses) from creating pools of their own.
    starttime = time.time()
    # The context manager releases the pool's worker processes on exit;
    # map() itself blocks until all four jobs have finished.
    with Pool() as pool:
        pool.map(cpu_extensive, range(4))
    endtime = time.time()
    print(f"Time taken {endtime - starttime} seconds.")
But I need to treat all frames with multiprocessing (to get it treated parallel).

So I need to loop over all frames with multiprocessing and not repeat the whole function create_phash()...

I guess that in line 22 from hanger_detection.py I have to treat "frame_phash = str(imagehash.phash(frame))" with multiprocessing...

Would you please be so kind and tell me what I should do?

That would be great...

Thanks a lot...
Reply
#2
Hi,

normally the frames in the folder are treated sequentially and because of this frame_hash_list contains the hash strings in the right order.

I think if I will use multiprocessing the chronological order will be mixed up in frame_hash_list.

Because of this I think it could be a good idea to use a dictionary to store as key (=frame_nr) and as value (=hash_string).

Perhaps it is possible to sort the dictionary by the keys when the dictionary contains all keys and values...

When the dictionary is sorted the function detect_hangers should be applied to the dictionary...

Or the list frame_hash_list should contain only the values (hash_strings) of the dictionary.

Here is my attempt to create a hash string of each frame using multiprocessing:

main.py:

import hanger_detection
from multiprocessing import Pool
import time

# Folder handed to hanger_detection.scandir() below.
p = "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/phash_test/"

frame_hash_dict = {}
hangers = []
pool = Pool()
measure_time_start = time.time()
obj = hanger_detection.scandir(p)

for entry in obj:
    # NOTE(review): create_phash(entry) is evaluated right here in the parent
    # process and its result is handed to pool.map() in place of the function,
    # with no iterable argument, so no work is distributed to the workers.
    frame_phash = pool.map(hanger_detection.create_phash(entry))
    # add new values
    frame_hash_dict[entry.name] = frame_phash
pool.close()

print("frame_hash_dict" + str(frame_hash_dict))

# NOTE(review): frame_hash_list is never assigned in this script; only
# frame_hash_dict is filled above.
hangers = hanger_detection.detect_hangers(frame_hash_list)
measure_time_end = time.time()
number_of_hangers = len(hangers)
hanger_detection.fill_hanger_information_in_excel(hangers)

print("frame_hash_list: " + str(frame_hash_list))
print("hangers: " + str(hangers))
# "Zeit" is German for "time": seconds between the two time.time() readings.
print("Zeit: " + str(measure_time_end - measure_time_start))
print("number_of_hangers: " + str(number_of_hangers))
hanger_detection.py:

import os
from PIL import Image
import imagehash
import openpyxl
from itertools import zip_longest


def difference_count(a: str, b: str) -> int:
    """Count the positions at which a and b do not match.

    Both strings are walked in parallel; positions that exist in only one of
    them (because the other string is shorter) each count as one difference.
    """
    total = 0
    for char_a, char_b in zip_longest(a, b):
        total += 1 if char_a != char_b else 0
    return total


# def create_phash():
#     frame_hash_list = []
#     p = "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/phash_test/"
#     obj = os.scandir(p)
#     for entry in obj:
#         # load frames
#         frame = Image.open(p + str(entry.name))
#         # create pHash
#         # Compare hashes to determine whether the frames are the same or not
#         frame_phash = str(imagehash.phash(frame))
#         frame_hash_list.append(frame_phash)
#     obj.close()
#     return frame_hash_list


def scandir(p):
    """Return the directory entries of folder p as a list sorted by name.

    p : path of the folder to list.

    The entries are collected while the directory handle is still open;
    handing back the already-closed os.scandir() iterator would give the
    caller nothing to iterate over.  Sorting is needed because os.scandir()
    yields entries in arbitrary order.
    """
    with os.scandir(p) as entries:
        return sorted(entries, key=lambda entry: entry.name)


frame_hash_dict = {}
p = "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/phash_test/"


def create_phash(entry):
    """Return the perceptual hash string of one frame image in folder p.

    entry : the frame to hash, either an object with a "name" attribute (such
    as an os.DirEntry) or the plain file name as a string.  Plain file names
    are accepted because os.DirEntry objects cannot be pickled and therefore
    cannot be sent to the worker processes of a multiprocessing pool.
    """
    name = getattr(entry, "name", entry)
    # The "with" block closes the image file as soon as the hash is computed.
    with Image.open(os.path.join(p, str(name))) as frame:
        return str(imagehash.phash(frame))


def detect_hangers(frame_hash_list, threshold: int = 0, min_count: int = 4):
    """Return list of "hangers" detected in frame_hash_list.
    A "hanger" is a run of consecutive frames that are the same.

    frame_hash_list : list of frame hash strings.  Frames are considered
    same or different by counting the differences in their hash strings.

    threshold : Maximum number of differences allowed for two frames to be
    considered "same".

    min_count : Minimum length of a hanger.  Short hangers aren't noticeable
    and don't have to be removed.

    Returns a list of (start_index, end_index) tuples, both indexes
    inclusive.  An empty frame_hash_list yields an empty list.
    """
    hangers = []  # List of hanger (start, stop) frame indexes
    if not frame_hash_list:
        return hangers

    start_index = 0
    start_frame = frame_hash_list[0]
    for index, frame in enumerate(frame_hash_list[1:], start=1):
        # Are frame and start_frame dissimilar enough?
        if difference_count(start_frame, frame) > threshold:
            if index - start_index >= min_count:
                # Add hanger to list
                hangers.append((start_index, index - 1))
            start_frame = frame
            start_index = index

    # Check if we end with a hanger.  The last run reaches up to and including
    # the final frame, so its length is last_index - start_index + 1; it is
    # held to the same min_count as every other run.
    last_index = len(frame_hash_list) - 1
    if last_index - start_index + 1 >= min_count:
        hangers.append((start_index, last_index))
    return hangers


def convert_frame_nr_in_time(d, frames_per_second=20):
    """Split frame number d into whole hours, minutes and seconds of playing time.

    d : frame number, counted from the start of the film.

    frames_per_second : frame rate of the digitised film.  The default of 20
    gives 1200 frames per minute and 72000 frames per hour.

    Returns a tuple (hours, minutes, seconds) of floats holding whole numbers;
    frames that do not fill a complete second are dropped.
    """
    frames_per_minute = 60 * frames_per_second
    frames_per_hour = 60 * frames_per_minute

    # divmod() yields the whole units and the remaining frames in one step.
    hours, rest = divmod(d, frames_per_hour)
    minutes, rest = divmod(rest, frames_per_minute)
    seconds = rest // frames_per_second

    # Floats are returned because callers have always received floats here.
    return float(hours), float(minutes), float(seconds)


def _frame_nr_to_timestamp(frame_nr):
    """Return frame_nr as a zero-padded "HH:MM:SS" string."""
    hours, minutes, seconds = convert_frame_nr_in_time(frame_nr)
    # convert_frame_nr_in_time() returns floats; truncate before formatting.
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"


def fill_hanger_information_in_excel(hangers):
    """Write the start and end timestamp of every hanger into the workbook.

    hangers : sequence of hangers; item 0 of each hanger is its first frame
    number and item 1 its last frame number.

    Sheet "Blatt" is filled from row 5 downwards, one row per hanger: the
    start timestamp goes into column 2 and the end timestamp into column 4.
    The first 100 rows of that area are emptied beforehand, and the workbook
    is saved back to the same file.
    """
    p = "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/S8-Hanger_Positionen.xlsx"
    first_row = 5
    start_column = 2
    end_column = 4

    fileXLSX = openpyxl.load_workbook(p)
    sheet = fileXLSX["Blatt"]

    # clear old hanger information
    # film doesn't have more than 100 hangers
    # Columns 2 through 4 are emptied so that no end timestamp from an earlier
    # run is left behind in column 4.
    for row in range(first_row, first_row + 100):
        for column in range(start_column, end_column + 1):
            sheet.cell(row=row, column=column).value = None

    # fill in hanger information
    for row, hanger in enumerate(hangers, start=first_row):
        timestamp_start_str = _frame_nr_to_timestamp(hanger[0])
        timestamp_end_str = _frame_nr_to_timestamp(hanger[1])
        sheet.cell(row=row, column=start_column).value = timestamp_start_str
        sheet.cell(row=row, column=end_column).value = timestamp_end_str
    fileXLSX.save(p)
I would be very thankful if someone could help me out...

Thanks a lot...
Reply
#3
I don't think you need to worry about collating the results. Python multiprocessing can do that for you. This is a simple example showing how using Pool and map will organize the results.
from multiprocessing import Pool
import random
from time import sleep

def process(n):
    """Sleep for a random whole number of seconds (0 to 5), print "Exit n", and return n."""
    delay = random.randint(0, 5)
    sleep(delay)
    print("Exit", n)
    return n

if __name__ == "__main__":
    # Is VERY important to protect this code on Windows
    # Four workers handle the numbers 1..10; map() returns the results in
    # input order regardless of which worker finishes first.
    with Pool(processes=4) as workers:
        ordered_results = workers.map(process, range(1, 11))
        print(ordered_results)
Output:
Exit 2 Exit 1 Exit 6 Exit 4 Exit 8 Exit 5 Exit 9 Exit 7 Exit 10 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Even though the processes did not complete in the order they were called, the results are in the correct order.
Reply
#4
(Nov-10-2022, 10:19 PM)deanhystad Wrote:  # Is VERY important to protect this code on Windows

Hi deanhystad,

thanks a lot for your good answer!!

I don't understand what you mean with "protect the code on windows"...
Could you please explain it?

I wish you a great weekend...

Greetings,

flash77
Reply
#5
Take the "if __name__ == "__main__":" out and see what happens when you run the program. Come up with a theory about why the program acts so differently and let us know what you think.
Reply
#6
Hi,

I caught up on the idiom 'if __name__=="__main__"' on the internet...

Because I'm currently tied up with work, I have not yet had the chance to test the information I found on my example.

I will let you know once I have had the chance to try it out...

Here is the information I gathered and a little example to explain the idiom:

file.py
def function(text=None):
    """Placeholder for the module's real work; takes the text the script passes in."""
    ...


if __name__ == "__main__":
    # Runs only when file.py is executed as a script, not when it is imported.
    text = input()
    print(function(text))
If file.py is run as a script (function() is in the same script), the idiom returns True.
text = input()
print(function(text)) 
is executed.

If function() is imported from another module, the idiom returns False.
text = input()
print(function(text)) 
is NOT executed.

Nesting code under the idiom allows to cater to different use cases:

When file.py runs as a script, the code nested under the idiom is executed.

When file.py is imported as a module then function() gets defined, but the code nested under the idiom is not executed.
With this you can provide function() to your code (you can use it) without any side effects.

In the top-level code environment, the script's __name__ is "__main__".

Then __name__ of an imported module is the module's name as a string.

I will let you know once I have applied the theory to my multiprocessing hanger detection.

Greetings,

flash77
Reply
#7
Please note:
# True only when this file is executed directly; False when it is imported.
if __name__ == '__main__':
    ...
Should only contain what you wish to run when executing your code from shell.
It is being misused in your code.

Anything within the condition will not get run if the module is imported into another module.
This is the expected behavior, and is in fact the main reason for the condition.
For example, it's very handy for executing one piece of a large project, or for unit testing.

see: https://docs.python.org/3/library/__main...nvironment
Reply
#8
Where is the misuse?
Reply
#9
deanHystad Wrote:Where is the misuse?

Perhaps misuse is too strong, but if there is any intention of importing the script into another module,
then, when the code is called from the other module the following will not get executed:

    with Pool(processes=4) as pool:
        results = pool.map(process, range(1, 11))
        print(results)
Reply
#10
You can (maybe should) put it in a function I suppose.
def launch(processes=4):
    """Run process() over the numbers 1 to 10 in a worker pool and return the results.

    processes : number of worker processes in the pool.

    On Windows, whoever calls this function must do so from behind
    "if __name__ == '__main__':" so that spawned worker processes do not call
    it themselves and create process pools of their own.
    """
    workers = Pool(processes=processes)
    with workers:
        results = workers.map(process, range(1, 11))
    return results

if __name__ == "__main__":
    # Call launch(); printing the bare name would only show the function
    # object instead of the mapped results.
    print(launch())
But it is very important, at least on Windows, that you provide protection against running the code inside the sub-processes. Windows spawns a new process and imports the program file to define the worker function. Without hiding the pool creation behind "if __name__ == '__main__':", the spawned process will create a new pool and begin spawning its own processes.
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Compare folder A and subfolder B and display files that are in folder A but not in su Melcu54 3 679 Jan-05-2024, 05:16 PM
Last Post: Pedroski55
  Compare filename with folder name and copy matching files into a particular folder shantanu97 2 4,658 Dec-18-2021, 09:32 PM
Last Post: Larz60+
  Move file from one folder to another folder with timestamp added end of file shantanu97 0 2,559 Mar-22-2021, 10:59 AM
Last Post: shantanu97
  Python Cut/Copy paste file from folder to another folder rdDrp 4 5,239 Aug-19-2020, 12:40 PM
Last Post: rdDrp
  Delete directories in folder is not working after folder is updated asheru93 2 2,724 Feb-13-2019, 12:37 PM
Last Post: asheru93
  copy content of folder to existing folder shlomi27 0 2,698 Aug-11-2018, 01:44 PM
Last Post: shlomi27

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020