more compressed files

6cb62a30 · Bryce Hepner · 96e7a95d · 6cb62a30 · 6cb62a30 · 6cb62a30
Commit 6cb62a30 authored Jun 08, 2022 by Bryce Hepner
31 changed files
--- a/WorkingPyDemo.py
+++ b/WorkingPyDemo.py
@@ -8,6 +8,7 @@ from scipy.optimize import minimize,linprog
 from sklearn.neighbors import KernelDensity
 from collections import Counter
 import numpy.linalg as la
+from time import time
 def file_extractor(dirname="images"):
    files = os.listdir(dirname)
@@ -120,7 +121,7 @@ def predict_pix(tiff_image_path, difference = True):
    # calculate the error
    error = np.ravel(image_array[1:-1,1:-1])-predict
-    return image_array, predict, diff, error
+    return image_array, diff, error
 """
 this huffman encoding code is found online
@@ -173,3 +174,362 @@ def decode_string(huffman_string, the_keys, the_values):
            return (int(the_keys[the_values.index(huffman_string[:i+1])]),huffman_string[i+1:])
        except:
            pass
+def make_dictionary(tiff_image_path_list, num_bins=4, difference = True):
+    """
+    Build one shared set of Huffman tables from several images.
+
+    The boundary values and the binned prediction errors of every image in
+    tiff_image_path_list are pooled, and one Huffman table is built per pool.
+
+    Input:
+    tiff_image_path_list (list of string): paths to the tiff files
+    num_bins             (int): number of error pools; the thresholds below
+                                are fixed, so this should be len(bins) + 1
+    difference           (bool): passed through unchanged to predict_pix
+    Return:
+    huffman_encoding_list  list (num_bins + 1): table 0 is built from the
+                           boundary values, the others from the error pools
+    bins                   list (3,): thresholds used to cut the bins
+    """
+    # thresholds are fixed; bound before the loop so they exist for an empty list
+    bins = [21,32,48]
+    # pool 0 collects boundary values, pools 1.. collect the binned errors
+    list_of_all_vals = [[] for _ in range(num_bins+1)]
+    for tiff_image_path in tiff_image_path_list:
+        image_array, diff, error = predict_pix(tiff_image_path, difference)
+        # boundary = top row, bottom row, left column, right column
+        boundary = np.hstack((image_array[0,:],image_array[-1,:],image_array[1:-1,0],image_array[1:-1,-1]))
+        # store the boundary relative to the very first pixel
+        boundary = boundary - image_array[0,0]
+        # boundary is 1dim, so boundary[0] is the first pixel, kept as is
+        boundary[0] = image_array[0,0]
+        list_of_all_vals[0].extend(str(j) for j in boundary)
+        # bin k holds the errors whose diff lies in (bins[k-1], bins[k]]
+        for k, upper in enumerate(bins):
+            mask = diff <= upper
+            if k > 0:
+                mask = mask & (diff > bins[k-1])
+            list_of_all_vals[k+1].extend(str(j) for j in error[mask].astype(int))
+        # the last pool takes everything above the highest threshold
+        mask = diff > bins[-1]
+        list_of_all_vals[-1].extend(str(j) for j in error[mask].astype(int))
+    huffman_encoding_list = []
+    for item in list_of_all_vals:
+        # most frequent symbols first, as make_tree is given them
+        freq = sorted(Counter(item).items(), key=lambda x: x[1], reverse=True)
+        node = make_tree(freq)
+        huffman_encoding_list.append(huffman_code_tree(node))
+    # return the huffman dictionaries and the thresholds they were built with
+    return huffman_encoding_list,bins
+def huffman(tiff_image_path, num_bins=4, difference = True):
+    """
+    Prepare one image for encoding.
+
+    Despite its name this function builds no Huffman table; it only lays out
+    the values that encoder() looks up in tables built elsewhere.
+
+    Input:
+    tiff_image_path     (string): path to the tiff file
+    num_bins            (int): unused, kept so existing callers still work
+    difference          (bool): passed through unchanged to predict_pix
+    Return:
+    image_as_array      ndarray: the image as returned by predict_pix
+    new_error           ndarray (same shape as the image): the interior holds
+                        the prediction error, the boundary holds each pixel
+                        minus the first pixel, and [0,0] holds the first pixel
+    diff                ndarray (rows - 2, cols - 2): diff from predict_pix
+                        reshaped to the interior of the image
+    """
+    image_as_array, diff, error= predict_pix(tiff_image_path, difference)
+    # create a error matrix that includes the boundary (used in encoding matrix)
+    new_error = np.copy(image_as_array)
+    # interior = everything but the outermost rows and columns
+    interior_shape = new_error[1:-1,1:-1].shape
+    new_error[1:-1,1:-1] = np.reshape(error,interior_shape)
+    # the boundary is stored relative to the very first pixel
+    keep = new_error[0,0]
+    new_error[0,:] = new_error[0,:] - keep
+    new_error[-1,:] = new_error[-1,:] - keep
+    new_error[1:-1,0] = new_error[1:-1,0] - keep
+    new_error[1:-1,-1] = new_error[1:-1,-1] - keep
+    # the row subtraction above zeroed the first pixel, so put it back
+    new_error[0,0] = keep
+    diff = np.reshape(diff,interior_shape)
+    return image_as_array, new_error, diff
+def encoder(error, list_dic, diff, bins):
+    """
+    This function encodes the error matrix with huffman coding tables
+    Input:
+    error     (rows, cols): a matrix with all the errors, boundary included
+    list_dic  (len(bins) + 2,): a list of huffman coding tables; table 0 is
+              used for the boundary, table k + 1 for the k-th bin
+    diff      (rows - 2, cols - 2): value used to pick the bin of each
+              interior pixel
+    bins      (ascending list): thresholds that cut the bins
+    Return:
+    returnable_encode (string): the codes of all pixels, row by row
+    """
+    # the tables are keyed by the string form of the integer value
+    encoded = np.copy(error).astype(int).astype(str).astype(object)
+    last_row = encoded.shape[0]-1
+    last_col = encoded.shape[1]-1
+    # collect the codes and join once; += on a long string is quadratic
+    pieces = []
+    for i in range(encoded.shape[0]):
+        for j in range(encoded.shape[1]):
+            if i == 0 or i == last_row or j == 0 or j == last_col:
+                table = list_dic[0]
+            else:
+                d = diff[i-1][j-1]
+                # first threshold that d does not exceed; past the last
+                # threshold the final table is used
+                bin_index = next((k for k, upper in enumerate(bins) if d <= upper), len(bins))
+                table = list_dic[bin_index+1]
+            pieces.append(table[encoded[i][j]])
+    return "".join(pieces)
+def decoder(encoded_string, list_dic, bins, use_diff, shape=(512, 640)):
+    """
+    This function decodes a bit string produced by encoder().
+    Input:
+    encoded_string  (string): the bits to decode; trailing bits left over
+                    after the last pixel are ignored
+    list_dic        (len(bins) + 2,): a list of huffman coding tables; table 0
+                    is used for the boundary, table k + 1 for the k-th bin
+    bins            (ascending list): thresholds that cut the bins
+    use_diff        (bool): True bins on max - min of the 4 known neighbors,
+                    False bins on the least squares residual of those neighbors
+    shape           (rows, cols): size of the image to rebuild
+    Return:
+    decode_matrix   (rows, cols): decoded matrix of ints
+    """
+    A = np.array([[3,0,-1],[0,3,3],[1,-3,-4]]) # the matrix for system of equation
+    # decode_string takes each table as parallel lists of keys and codes
+    tables = [(list(dic.keys()), list(dic.values())) for dic in list_dic]
+    #Matrix system of points that will be used to solve the least squares fitting hyperplane
+    points = np.array([[-1,-1,1], [-1,0,1], [-1,1,1], [0,-1,1]])
+    decode_matrix = np.zeros(shape)
+    last_row = decode_matrix.shape[0]-1
+    last_col = decode_matrix.shape[1]-1
+    # pixels must be visited in the order they were encoded: row by row
+    for i in range(decode_matrix.shape[0]):
+        for j in range(decode_matrix.shape[1]):
+            # the very first pixel is stored as is
+            if i == 0 and j == 0:
+                the_keys, the_values = tables[0]
+                colorvalue, encoded_string = decode_string(encoded_string,the_keys=the_keys, the_values=the_values)
+                decode_matrix[i][j] = colorvalue
+            # the rest of the boundary is stored relative to the first pixel
+            elif i == 0 or i == last_row or j == 0 or j == last_col:
+                the_keys, the_values = tables[0]
+                colorvalue, encoded_string = decode_string(encoded_string,the_keys=the_keys, the_values=the_values)
+                decode_matrix[i][j] = colorvalue + decode_matrix[0][0]
+            # if not the boundary
+            else:
+                # predict the pixel from the 4 neighbors that are already decoded
+                z0 = decode_matrix[i-1][j-1]
+                z1 = decode_matrix[i-1][j]
+                z2 = decode_matrix[i-1][j+1]
+                z3 = decode_matrix[i][j-1]
+                y0 = int(-z0+z2-z3)
+                y1 = int(z0+z1+z2)
+                y2 = int(-z0-z1-z2-z3)
+                y = np.vstack((y0,y1,y2))
+                if use_diff:
+                    difference = max(z0,z1,z2,z3) - min(z0,z1,z2,z3)
+                else:
+                    # the second value returned by lstsq is the residual
+                    f, difference, rank, s = la.lstsq(points, [z0,z1,z2,z3], rcond=None)
+                    difference = difference.astype(int)
+                predict = np.round(np.round(np.linalg.solve(A,y)[-1][0],1))
+                # first threshold that difference does not exceed; past the
+                # last threshold the final table is used
+                bin_index = next((k for k, upper in enumerate(bins) if difference <= upper), len(bins))
+                the_keys, the_values = tables[bin_index+1]
+                # the stored value is the error, so add the prediction back
+                colorvalue, encoded_string = decode_string(encoded_string,the_keys=the_keys, the_values=the_values)
+                decode_matrix[i][j] = colorvalue + int(predict)
+    return decode_matrix.astype(int)
+def read_from_file(filename):
+    """
+    Return the whole content of filename as bytes.
+    """
+    with open(filename, 'rb') as source:
+        content = source.read()
+    return content
+def bitstring_to_bytes(input_string):
+    """
+    Pack a string of "0" and "1" characters into bytes, 8 bits per byte.
+
+    When the length is not a multiple of 8 the last byte is filled up with
+    zero bits on the right. The number of filler bits is not recorded.
+    Input:
+    input_string  (string): the bits, most significant bit of each byte first
+    Return:
+    bytes: ceil(len(input_string) / 8) bytes
+    """
+    leftover = len(input_string) % 8
+    if leftover:
+        input_string += "0" * (8-leftover)
+    # step through the string by index; re-slicing the rest on every byte
+    # would copy the whole remaining string each time
+    return bytes(int(input_string[k:k+8],2) for k in range(0, len(input_string), 8))
+def bytes_to_bitstring(input_bytearray):
+    """
+    Expand bytes into a string of "0" and "1" characters, 8 per byte.
+
+    Any filler bits that bitstring_to_bytes added to the last byte come
+    back as trailing zeros.
+    Input:
+    input_bytearray  (bytes): the packed bits
+    Return:
+    string: 8 * len(input_bytearray) characters
+    """
+    # "08b" = binary, zero padded to 8 digits
+    return "".join(format(item, "08b") for item in input_bytearray)
+# demo: compress the first ten images, then read them back and compare
+starttime = time()
+scenes = file_extractor()
+newnamesforlater = []
+images = image_extractor(scenes)
+# one shared set of huffman tables, built from the same ten images
+list_dic, ogbins = make_dictionary(images[0:10], 4, False)
+file_size_ratios = []
+# a list of dictionaries is saved as a pickled object array
+np.save("first_dic.npy", list_dic)
+for i in range(10):
+    image, new_error, diff = huffman(images[i], 4, False)
+    encoded_string1 = encoder(new_error, list_dic, diff, ogbins)
+    inletters = bitstring_to_bytes(encoded_string1)
+    # drop the extension: 5 characters for ".tiff", otherwise 4 (".tif")
+    if images[i].endswith(".tiff"):
+        newname = images[i][:-5]
+    else:
+        newname = images[i][:-4]
+    newnamesforlater.append(newname + "_Compressed.txt")
+    with open(newname + "_Compressed.txt", 'wb') as f:
+        f.write(inletters)
+# NOTE: allow_pickle runs pickle on the file, only load files you wrote yourself
+list_dic = np.load("first_dic.npy", allow_pickle=True)
+ogbins = [21,32,48]
+for i,item in enumerate(newnamesforlater[0:10]):
+    # huffman is only called here to get the original image to compare with
+    image, new_error, diff = huffman(images[i], 4, False)
+    encoded_string2 = bytes_to_bitstring(read_from_file(item))
+    reconstruct_image = decoder(encoded_string2, list_dic, ogbins, False)
+    print(np.allclose(image, reconstruct_image))
+print(time() - starttime)
\ No newline at end of file
--- a/images/1626032610_393963/1626032610_393963_0._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_0._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_1._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_1._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_10._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_10._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_11._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_11._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_12._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_12._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_13._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_13._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_14._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_14._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_15._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_15._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_2._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_2._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_3._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_3._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_4._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_4._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_5._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_5._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_7._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_7._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_8._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_8._Compressed.txt
--- a/images/1626032610_393963/1626032610_393963_9._Compressed.txt
+++ b/images/1626032610_393963/1626032610_393963_9._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_0._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_0._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_1._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_1._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_10._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_10._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_11._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_11._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_12._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_12._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_13._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_13._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_14._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_14._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_15._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_15._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_2._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_2._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_3._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_3._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_4._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_4._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_5._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_5._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_7._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_7._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_8._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_8._Compressed.txt
--- a/images/1626033496_437803/1626033496_437803_9._Compressed.txt
+++ b/images/1626033496_437803/1626033496_437803_9._Compressed.txt