Aug-14-2023, 02:28 AM
(This post was last modified: Aug-14-2023, 02:10 PM by deanhystad.)
This is quick and robust (I think). It uses numpy reshape() and concatenate() to pad the 24-bit integers to 32 bits. It is probably not as quick as using as_strided(), but it only takes 0.006 seconds to process a 1 MB file.
import numpy as np
import sys  # not needed by int24 any more; kept in case other code uses it


def int24(bytes_, byteorder="little"):
    """Convert packed 3-byte unsigned integers to a numpy array of uint32.

    Each group of three bytes is padded with one zero byte (using
    reshape() and concatenate()) and reinterpreted as a 32-bit integer.
    Trailing bytes that do not fill a whole group are ignored.

    Args:
        bytes_: 1-D numpy array of dtype uint8 holding the raw data.
        byteorder: Byte order of the 24-bit values in the data,
            "little" (default) or "big".  This describes the data, not
            the machine running the code, so the result is the same on
            every platform.

    Returns:
        1-D numpy array of dtype uint32 with len(bytes_) // 3 elements.

    Raises:
        ValueError: If byteorder is neither "little" nor "big".
    """
    # How many 3 byte ints are in bytes_?
    count = bytes_.shape[0] // 3
    # Reshape bytes_ into 3 byte arrays.
    triples = bytes_[:count * 3].reshape((count, 3))
    zeros = np.zeros((count, 1), dtype=np.uint8)
    # Pad with zeros on the most significant side to make 4 byte arrays.
    if byteorder == "little":
        padded = np.concatenate((triples, zeros), axis=1)
        dtype = "<u4"
    elif byteorder == "big":
        padded = np.concatenate((zeros, triples), axis=1)
        dtype = ">u4"
    else:
        raise ValueError(
            f"byteorder must be 'little' or 'big', not {byteorder!r}"
        )
    # Convert 4 byte arrays to 4 byte ints.  The explicit-endian dtype
    # decodes the data correctly on any host; astype() then yields the
    # native uint32 dtype (no copy when it already matches).
    values = np.frombuffer(padded.tobytes(), dtype=dtype)
    return values.astype(np.uint32, copy=False)


def most_common_in_range(values, low=0x8A0000, high=0x8F0000, limit=10):
    """Return the most frequent values that fall in [low, high).

    Args:
        values: numpy array of integers.
        low: Inclusive lower bound.
        high: Exclusive upper bound.
        limit: Maximum number of entries to return.

    Returns:
        List of (count, hex value) tuples sorted in descending order,
        at most ``limit`` long.
    """
    # Throw away values that are not in range 0x8A0000...0x8F0000
    inrange = values[(values >= low) & (values < high)]
    # Get counts for each value.  Save as tuple (count, hex value).
    # Plain ints keep the printed output free of numpy scalar reprs.
    uniques, tallies = np.unique(inrange, return_counts=True)
    counts = [
        (int(tally), hex(int(value)))
        for value, tally in zip(uniques, tallies)
    ]
    return sorted(counts, reverse=True)[:limit]


def main():
    """Print the ten most frequent in-range 24-bit values in test.txt."""
    # Load file and convert to 24bit ints.
    bytes_ = np.fromfile('test.txt', dtype=np.uint8)
    asints = int24(bytes_)
    print(most_common_in_range(asints))


if __name__ == "__main__":
    main()

# And if the 8C/8D/8E can be anywhere in the file, at any offset, just shift
# the bytes_ array and resample.
# Read the raw bytes, then decode the stream three times, starting at byte
# offsets 0, 1 and 2, so every possible 24 bit alignment is sampled.
bytes_ = np.fromfile('test.txt', dtype=np.uint8)
asints = np.concatenate([int24(bytes_[offset:]) for offset in range(3)])