I've modified some code that takes a .wav file and turns it into a .png.
The original source for the .wav to .png conversion is from:
https://github.com/bobvanluijt/audio-convolutional-neural-network/blob/master/convertWavToPng.py
I've edited it so that instead the colors are sorted in a gradient fashion so that it looks like:
https://imgur.com/a/TSwEOdt
Here's my modified script:
from PIL import Image
import wave, struct, sys, math

##
# Collect input
##
if sys.argv[1][-4:] != '.wav':
    sys.exit("First argument should be a .wav file")
if sys.argv[2][-4:] != '.png':
    sys.exit("Second argument should be a .png file")

##
# Conversion:
##

# Wave file needs to be 16 bit mono
waveFile = wave.open(sys.argv[1], 'r')
if waveFile.getnchannels() != 1:
    sys.exit("ERROR: The wave file should be single channel (mono)")

imageRgbArray = list()
waveLength = waveFile.getnframes()

# Create the image size (based on the length)
imageSize = math.ceil(math.sqrt(waveLength))

# Loop through the wave file
for i in range(waveLength):
    # Try to read a frame; if not possible, fill with 0x0
    try:
        waveData = waveFile.readframes(1)
        data = struct.unpack("<h", waveData)  # This loads one 16-bit sample
        convertedData = int(data[0]) + 32768
    except struct.error:
        convertedData = 0
    # Split the sample into three 5-bit chunks (one per channel), then scale each up by 3 bits
    bits = 5
    rgbData = tuple([(convertedData >> bits * i) & (2 ** bits - 1) for i in range(3)])
    rgbData = tuple(map(lambda x: x << 3, rgbData))
    # Add the RGB value to the image array
    imageRgbArray.append(rgbData)

# Create new image
im = Image.new('RGB', (int(imageSize), int(imageSize)))

# Add image data
im.putdata(list(sorted(imageRgbArray)))

# Save image
im.save(sys.argv[2])
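To make the packing concrete, here's a worked example for one sample (the numbers follow directly from the code above): the sample is offset into 0-65535, split into three 5-bit chunks, so only 15 of the 16 bits are kept, and each chunk is scaled up by << 3 so it spans 0-248.

# Worked example of the packing for one sample (bits = 5, as above)
sample = -12345
converted = sample + 32768                     # 20423 == 0b0100111111000111
chunks = [(converted >> (5 * i)) & 0b11111 for i in range(3)]
# chunks == [7, 30, 19] -- the low, mid and high 5-bit groups; bit 15 is dropped
rgb = tuple(c << 3 for c in chunks)            # (56, 240, 152)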
But now I need to be able to convert the sorted .png back into a .wav file.
Luckily, I already have this to work with:
https://github.com/bobvanluijt/audio-convolutional-neural-network/blob/master/convertPngToWav.py
from PIL import Image
import wave, struct, sys, soundfile as Sndfile, numpy as np, math

##
# Collect input
##
if sys.argv[1][-4:] != '.png':
    sys.exit("First argument should be a .png file")
if sys.argv[2][-4:] != '.wav':
    sys.exit("Second argument should be a .wav file")

##
# Conversion:
##

# Open image
with Image.open(sys.argv[1]) as pngFile:
    # Load image
    pngAllPixels = pngFile.load()
    # Set the counters that walk the image
    countX = 0
    countY = 0
    count = pngFile.size[0] * pngFile.size[1]
    # Create the array which will contain all the samples
    bitArray = list()
    # Loop through the individual pixels
    while count > 0:
        # Load the pixel at the current location
        singlePixel = pngAllPixels[countX, countY]
        # Get RGB vals and convert them to a hex string
        singlePixelToHexString = '%02x%02x%02x' % (singlePixel[0], singlePixel[1], singlePixel[2])
        # Break if end of file (0x0)
        if singlePixelToHexString == "000000":
            break  # break because audio is < 44100 bit
        # Parse the hex string back into an integer
        singlePixelToHex = hex(int("0x" + singlePixelToHexString.lstrip("0"), 16) + int("0x0", 16))
        # This subtracts 16bit/2 (=32768) from the data to recover the signed sample
        singleBit = int(singlePixelToHex, 16) - 32768
        # Append the single sample to the array
        bitArray.append(singleBit)
        # Run through the image and set x and y vals (goes to next row when ready)
        if countX == (pngFile.size[0] - 1):
            countX = 0
            countY += 1
        else:
            countX += 1
        count -= 1

    # Convert the array into a Numpy array
    bitArrayNp = np.array(bitArray, dtype=np.int16)

    # Output the file
    Sndfile.write(sys.argv[2], bitArrayNp, 44100, 'PCM_16')
I've been told I need a way to convert each 3-byte pixel color back into a two-byte number to turn it back into the original .wav file. I think that means changing

rgbData = tuple(map(lambda x: x<<3, rgbData))

back to

rgbData = tuple(map(lambda x: x<<2, rgbData))

but I'm not entirely sure how to implement that in the pngtowav.py file.
I'm new at this, so anything helps.
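In case it helps, here is a rough sketch of what I think the inverse mapping would have to do, based on the bits = 5 packing and the << 3 scaling in my encoder above (the function name is mine):

# Sketch of the inverse packing: >> 3 undoes the scaling, then the three
# 5-bit chunks rebuild bits 0-14 of the offset sample. Bit 15 was never
# stored, and sorted() has already destroyed the original sample order.
def pixel_to_sample(rgb):
    bits = 5
    value = 0
    for i, channel in enumerate(rgb):
        value |= (channel >> 3) << (bits * i)
    return value - 32768  # undo the +32768 offset from the encoder

For example, pixel_to_sample((56, 240, 152)) gives back -12345, matching the worked example earlier. In pngtowav.py this would presumably replace the hex-string round trip, i.e. singleBit = pixel_to_sample(singlePixel).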
Related
I am trying to make realtime high-pass and low-pass filters using Python. Whenever I play any audio on my computer, I want the low-pass filter to process that audio in realtime and send its output to the left channel of my audio amplifier (my subwoofer), and the high-pass filter to send its output to the right channel (my tweeter), with both playing simultaneously.
I was successful in creating a high-pass filter using Python, but it plays on both channels, and it has to be saved to a .wav before it is played. I want it to take the audio from my computer in realtime and route the filtered signal to the corresponding right or left channel.
Here is the code for the high-pass filter I found on Stack Overflow:
import matplotlib.pyplot as plt
import numpy as np
import wave
import sys
import math
import contextlib

fname = 'Lil Nas X Industry Baby Lyrics ft Jack Harlow.wav'
outname = 'filtered.wav'

cutOffFrequency = 400.0

def running_mean(x, windowSize):
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[windowSize:] - cumsum[:-windowSize]) / windowSize

def interpret_wav(raw_bytes, n_frames, n_channels, sample_width, interleaved=True):
    if sample_width == 1:
        dtype = np.uint8  # unsigned char
    elif sample_width == 2:
        dtype = np.int16  # signed 2-byte short
    else:
        raise ValueError("Only supports 8 and 16 bit audio formats.")

    channels = np.frombuffer(raw_bytes, dtype=dtype)  # np.fromstring is deprecated

    if interleaved:
        # channels are interleaved, i.e. sample N of channel M follows sample N of channel M-1 in raw data
        channels.shape = (n_frames, n_channels)
        channels = channels.T
    else:
        # channels are not interleaved. All samples from channel M occur before all samples from channel M-1
        channels.shape = (n_channels, n_frames)

    return channels

with contextlib.closing(wave.open(fname, 'rb')) as spf:
    sampleRate = spf.getframerate()
    ampWidth = spf.getsampwidth()
    nChannels = spf.getnchannels()
    nFrames = spf.getnframes()
    signal = spf.readframes(nFrames * nChannels)

    channels = interpret_wav(signal, nFrames, nChannels, ampWidth, True)

    freqRatio = cutOffFrequency / sampleRate
    N = int(math.sqrt(0.196196 + freqRatio ** 2) / freqRatio)

    filtered = running_mean(channels[0], N).astype(channels.dtype)

    wav_file = wave.open(outname, "w")
    wav_file.setparams((1, ampWidth, sampleRate, nFrames, spf.getcomptype(), spf.getcompname()))
    wav_file.writeframes(filtered.tobytes('C'))
    wav_file.close()
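For the channel-routing part (not the realtime capture), here is a minimal offline sketch of one way to split the signal, reusing channels, N and sampleRate from the script above and assuming 16-bit audio: the running mean is the low-pass output, and subtracting it from the original gives a complementary high-pass, which can then be interleaved as the left and right channels of a stereo .wav. Realtime capture of whatever the computer is playing would additionally need a loopback/callback audio API, which this sketch does not cover.

# Offline sketch: low-pass on the left (subwoofer), high-pass on the right (tweeter)
mono = channels[0].astype(np.float64)
low = running_mean(mono, N)                        # low-pass via moving average
high = mono[:len(low)] - low                       # complementary high-pass
stereo = np.column_stack((low, high)).astype(np.int16)

out = wave.open('split.wav', 'wb')
out.setnchannels(2)                                # left = low-pass, right = high-pass
out.setsampwidth(2)
out.setframerate(sampleRate)
out.writeframes(stereo.tobytes())                  # frames are interleaved L/R
out.close()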
I created a bitmap font, basically a 256x256 png image where each character occupies an 8x8 tile. I want to use it with Pillow as an ImageFont, but there's no info on this in the Pillow docs. They say I can load bitmap fonts like this:

font = ImageFont.load("arial.pil")

but "PIL uses its own font file format to store bitmap fonts," so I guess a png file won't work. How can I tell PIL to use said bitmap and where each character is on it?
Not a complete answer, but too much for a comment, and it may be useful or spur someone else to work out the other 60% :-)
I may delete it if anyone else comes up with something better...
You can go to the Pillow repository on Github and download a ZIP file of the code.
If you go in there and nose around you will find two things that appear to work hand-in-hand, namely a .PIL file and a .PBM file.
In Tests/fonts there is a file called 10x20.pbm which is actually a PNG file if you look inside it. So, if you change its name to 10x20.png you can view it and it looks like this:
By the way, if you want to split that into 10x20 size chunks with one letter in each, you can use ImageMagick in Terminal like this:
convert 10x20.pbm -crop 10x20 char_%d.png
and you will get a bunch of files called char_0.png, char_1.png etc. The first 4 look like this:
If you look in src/PIL/FontFile.py there is this code that seems to know how to access/generate the metrics for a font:
#
# The Python Imaging Library
# $Id$
#
# base class for raster font file parsers
#
# history:
# 1997-06-05 fl   created
# 1997-08-19 fl   restrict image width
#
# Copyright (c) 1997-1998 by Secret Labs AB
# Copyright (c) 1997-1998 by Fredrik Lundh
#
# See the README file for information on usage and redistribution.
#

from __future__ import print_function

import os
from . import Image, _binary

WIDTH = 800


def puti16(fp, values):
    # write network order (big-endian) 16-bit sequence
    for v in values:
        if v < 0:
            v += 65536
        fp.write(_binary.o16be(v))


##
# Base class for raster font file handlers.

class FontFile(object):

    bitmap = None

    def __init__(self):
        self.info = {}
        self.glyph = [None] * 256

    def __getitem__(self, ix):
        return self.glyph[ix]

    def compile(self):
        "Create metrics and bitmap"

        if self.bitmap:
            return

        # create bitmap large enough to hold all data
        h = w = maxwidth = 0
        lines = 1
        for glyph in self:
            if glyph:
                d, dst, src, im = glyph
                h = max(h, src[3] - src[1])
                w = w + (src[2] - src[0])
                if w > WIDTH:
                    lines += 1
                    w = (src[2] - src[0])
                maxwidth = max(maxwidth, w)

        xsize = maxwidth
        ysize = lines * h

        if xsize == 0 and ysize == 0:
            return ""

        self.ysize = h

        # paste glyphs into bitmap
        self.bitmap = Image.new("1", (xsize, ysize))
        self.metrics = [None] * 256
        x = y = 0
        for i in range(256):
            glyph = self[i]
            if glyph:
                d, dst, src, im = glyph
                xx = src[2] - src[0]
                # yy = src[3] - src[1]
                x0, y0 = x, y
                x = x + xx
                if x > WIDTH:
                    x, y = 0, y + h
                    x0, y0 = x, y
                    x = xx
                s = src[0] + x0, src[1] + y0, src[2] + x0, src[3] + y0
                self.bitmap.paste(im.crop(src), s)
                self.metrics[i] = d, dst, s

    def save(self, filename):
        "Save font"

        self.compile()

        # font data
        self.bitmap.save(os.path.splitext(filename)[0] + ".pbm", "PNG")

        # font metrics
        with open(os.path.splitext(filename)[0] + ".pil", "wb") as fp:
            fp.write(b"PILfont\n")
            fp.write((";;;;;;%d;\n" % self.ysize).encode('ascii'))  # HACK!!!
            fp.write(b"DATA\n")
            for id in range(256):
                m = self.metrics[id]
                if not m:
                    puti16(fp, [0] * 10)
                else:
                    puti16(fp, m[0] + m[1] + m[2])
So hopefully someone has time/knowledge of how to put those two together to enable you to generate the metrics file for your PNG. I think you just need something that does the last 10 lines of that code for your PNG.
There appear to be 23 bytes of header which you can simply replicate, and then there are 256 "entries", i.e. 1 for each of 256 glyphs. Each entry has 10 numbers in it, and each number is 16-bit big endian.
Let's look at the header:
dd if=10x20.pil bs=23 count=1| xxd -c23 | more
00000000: 5049 4c66 6f6e 740a 3b3b 3b3b 3b3b 3230 3b0a 4441 5441 0a PILfont.;;;;;;20;.DATA.
Then you can see the entries using the command below to skip the header and group nicely:
dd if=10x20.pil bs=23 iseek=1| xxd -g2 -c20
which gives:
Column 1 appears to be the width of the glyph.
Column 7 is the x-offset of the left edge of the glyph in the image and column 9 is the x-offset of the right edge of the glyph in the image. So you will see that column 7 on each line is the same as column 9 on the previous line, i.e. the glyphs abut each other going across the image.
If you look at this extract from further down the file, you can see it starts a new row of glyphs in the output image in the middle of the extract (marked in red). That tells us that the bitmap should be no more than 800 pixels wide, and that column 8 is the y-offset of the top of the glyph in the bitmap file and column 10 is the y-offset of the bottom of the glyph in the bitmap. You should see that when a new row of glyphs starts in the bitmap file, x goes back to zero and column 8 takes the previous value from column 10.
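Putting those pieces together, here is a hypothetical generator for your 256x256 PNG of 8x8 tiles (32 glyphs per row), modelled on the last 10 lines of FontFile.save() above. The file name and variable names are mine, and the destination-box values (columns 3-6) are an assumption based on the layout analysis above, so treat it as a sketch rather than a verified implementation:

# Sketch: write a PILfont metrics file for a 256x256 bitmap of 8x8 tiles.
# Each glyph gets 10 big-endian 16-bit numbers: advance (dx, dy), a
# destination box (x0, y0, x1, y1) -- assumed here to be baseline-relative --
# and the source box (x0, y0, x1, y1) in the bitmap (columns 7-10 above).
import struct

TILE = 8                 # tile size in the 256x256 PNG
PER_ROW = 256 // TILE    # 32 glyphs per row

with open("myfont.pil", "wb") as fp:
    fp.write(b"PILfont\n")
    fp.write(b";;;;;;%d;\n" % TILE)  # same header "HACK" as FontFile.save()
    fp.write(b"DATA\n")
    for i in range(256):
        sx = (i % PER_ROW) * TILE
        sy = (i // PER_ROW) * TILE
        values = [TILE, 0,                        # advance
                  0, -TILE, TILE, 0,              # destination box (assumed)
                  sx, sy, sx + TILE, sy + TILE]   # source box in the bitmap
        fp.write(struct.pack(">10h", *values))

The bitmap itself would sit next to the metrics as myfont.png (or the .pbm-that-is-really-a-PNG trick noted above), after which ImageFont.load("myfont.pil") should pick the pair up.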
Hello, I am using Python to read the digit data provided by MNIST into a data structure I can use to train a neural network. I am testing that the data was read properly by creating an image using PIL. The image that is being created is horribly wrong, and I am not sure if it is because I am using PIL incorrectly or because my data structures and methods are not right.
The format of the two data files is described here:
http://yann.lecun.com/exdb/mnist/
Here are the applicable functions:
read_image_data reads the pixel data, organizing it into a list of 2D numpy arrays:
def read_image_data():
    fd = open("train-images.idx3-ubyte", "rb")
    images_bin_string = fd.read()
    num_images = struct.unpack(">i", images_bin_string[4:8])[0]
    image_data_bank = []

    uint32_num_bytes = 4
    current_index = 8
    num_rows = struct.unpack(">I", \
        images_bin_string[current_index: current_index + uint32_num_bytes])[0]
    num_cols = struct.unpack(">I", \
        images_bin_string[current_index + uint32_num_bytes: \
                          current_index + uint32_num_bytes * 2])[0]
    current_index += 8

    i = 0
    while i < num_images:
        image_data = np.zeros([num_rows, num_cols])
        for j in range(num_rows - 1):
            for k in range(num_cols - 1):
                image_data[j][k] = images_bin_string[current_index + j * k]
        current_index += num_rows * num_cols
        i += 1
        image_data_bank.append(image_data)
    return image_data_bank
read_label_data reads the corresponding labels into a list
def read_label_data():
    fd = open("train-labels.idx1-ubyte", "rb")
    images_bin_string = fd.read()
    num_images = struct.unpack(">i", images_bin_string[4:8])[0]
    image_data_bank = []

    current_index = 8
    i = 0
    while i < num_images:
        image_data_bank.append(images_bin_string[current_index])
        current_index += 1
        i += 1
    return image_data_bank
collect_data zips the structures together
def collect_data():
    print("Reading image data...")
    image_data = read_image_data()
    print("Reading label data...")
    label_data = read_label_data()
    print("Zipping data sets...")
    all_data = np.array(list(zip(image_data, label_data)))
    return all_data
Lastly, run_test uses PIL to display the pixels from the first 28x28 numpy structure created by read_image_data:
def run_test(data):
    example = data[0]
    pixel_data = example[0]
    number = example[1]
    print(number)
    im = Image.fromarray(pixel_data)
    im.show()
When I run the script:

Collecting data...
Reading image data...
Reading label data...
Zipping data sets...
5
I must be messing something up with the PIL library, but I do not know what.
That is a really weird looking 5. I am guessing that I went wrong somewhere in my organization of the data. The directions did say "Pixels are organized row-wise.", but I think I covered that by having my outer loop be the row loop and the inner one the column loop.
UPDATE
I reversed the order of the row and column index in the np.arrays in read_image_data and it is making no difference.
image_data[k][j] = images_bin_string[current_index + j * k]
UPDATE
Ran a quick test with matplotlib:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
imgplot = plt.imshow(pixel_data)
plt.show()
Here is what I got from matplotlib
That means it is definitely a problem with my code and not the library. The question is whether it is the way I am passing the pixels to the imaging libraries or how I structured the data. If anyone can find the mistake, I would greatly appreciate it.
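UPDATE
One more thing I am now suspecting (adding this in case it helps): the byte offset current_index + j * k cannot visit each pixel once, since it is 0 for the whole first row (j = 0) and repeats offsets elsewhere, and range(num_rows - 1) / range(num_cols - 1) skip the last row and column. A row-wise walk of the buffer would presumably look like this:

# presumed row-wise indexing: each pixel byte visited exactly once
for j in range(num_rows):
    for k in range(num_cols):
        image_data[j][k] = images_bin_string[current_index + j * num_cols + k]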
A strange thing happens when the following code is executed: the percentage which shows completion goes over 100%. The code has been running for about 45 minutes now.
This is the code, which I adapted to Python 3 after the errors the original one gave me.
'''
Read and pre-process SD19 characters text file.
Blog post : http://seeb0h.github.io/howto/preprocess-sd19-dataset-for-digits-learning/
Characters in txt file are in 128x128 images with much padded zeros.
It may be suitable for learning to have smaller, deskewed, trimmed, squared ones
Following preprocessing is applied to the dataset:
- Read glyph (see read_glyph())
- Moment-based image deskew (see deskew())
- Trim zeros rows and columns (see trim_padding())
- Resize image while keeping aspect ratio (see resize_with_constant_ratio())
- Pad zeros in order to get a square image (see pad_digit())
Extends original code from http://asciirain.com/wordpress/2013/04/08/exploring-sd19-glyph-recognition-with-randomforests/
Usage:
    preprocess_sd19_text.py
'''
#
import os
import re
import sys
import pickle
import cv2
import numpy as np
import math


def read_glyph(_line):
    """Extract digit from the text file

    Parameters
    ----------
    _line : string
        current line in SD19 text file
    Returns
    -------
    digit : np.array
        2D digit 128x128
    label : int
        the label
    """
    match = re.search(r"^(\S+) (\d+)", _line)
    label = match.group(1)
    vector = list(match.group(2))
    vector = [int(x) for x in vector]
    label = ord(label)
    label = str(symbol_map[label])  # changed from int to str
    digit = np.array(vector, 'float32')
    digit = (digit * -1. + 1.).reshape(128, 128)
    return digit, label


def deskew(img):
    """Deskew digit

    Parameters
    ----------
    img : np.array
        2D digit array
    Returns
    -------
    dst : Deskewed digit
    """
    m = cv2.moments(img)
    if abs(m['mu02']) < 1e-2:
        return img.copy()
    skew = m['mu11'] / m['mu02']
    rot_mat = np.float32([[1, skew, -0.5 * max(img.shape[0], img.shape[1]) * skew], [0, 1, 0]])
    img = cv2.warpAffine(img, rot_mat, (img.shape[0], img.shape[1]),
                         flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)
    return img


def resize_with_constant_ratio(img, char_dim):
    """Resize image while keeping aspect ratio. Max dim is char_dim
    pad_dim is applied in order to have derivative friendly image

    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        dst dim
    Returns
    -------
    dst : resized digit
    """
    roi_h = img.shape[0]
    roi_w = img.shape[1]
    max_dim = max(roi_w, roi_h)
    pad_dim = 2
    scale = float(char_dim - pad_dim) / max_dim

    if roi_w >= roi_h:
        new_w = int(char_dim - pad_dim)
        new_h = int(roi_h * scale)
    else:
        new_w = int(roi_w * scale)
        new_h = int(char_dim - pad_dim)

    dst = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return dst


def trim_padding(img):
    """Trim zeros rows and columns

    Parameters
    ----------
    img : np.array
        2D digit array
    Returns
    -------
    dst : trimmed digit
    """
    mask_row = np.all(np.equal(img, 0), axis=1)
    dst = img[~mask_row]
    mask_col = np.all(np.equal(dst, 0), axis=0)
    dst = dst[:, ~mask_col]
    return dst


def pad_digit(img, char_dim):
    """Pad zeros in order to get a square char_dim x char_dim image

    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        image dim
    Returns
    -------
    dst : padded digit
    """
    pad_h = char_dim - img.shape[0]
    pad_w = char_dim - img.shape[1]
    pad_h_b = math.floor(pad_h / 2)
    pad_h_t = pad_h - pad_h_b
    pad_w_r = math.floor(pad_w / 2)
    pad_w_l = pad_w - pad_w_r

    dst = np.hstack((img, np.zeros((img.shape[0], pad_w_r))))
    dst = np.hstack((np.zeros((dst.shape[0], pad_w_l)), dst))
    dst = np.vstack((dst, np.zeros((pad_h_b, dst.shape[1]))))
    dst = np.vstack((np.zeros((pad_h_t, dst.shape[1])), dst))
    return dst


def print_overwrite(text):
    """Print with overwrite (for progression counter)

    Parameters
    ----------
    text : string
        text to display
    """
    delete = "\b" * (len(text) + 1)
    print("{0}{1}".format(delete, text), end="")  # end="" replaces the Python 2 trailing comma


if __name__ == '__main__':
    print(__doc__)
    sd19_filename = "sd19-binary_digits.txt"
    data = open(sd19_filename, "r")
    dataset = []
    symbol_map = dict([(x, chr(x)) for x in
                       list(range(48, 58)) + list(range(65, 91)) + list(range(97, 123))])  # added list() to every range
    current_dir = os.curdir
    num_records = 0
    num_lines = 402953
    char_dim = 28
    pickle_name = "SD19_" + str(char_dim) + "x" + str(char_dim) + "_"
    for line in data:
        num_records += 1
        if num_records % 20000 == 0:
            with open(os.path.join(current_dir, pickle_name +
                                   str(num_records) + ".pickle"), 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        print_overwrite("num_records : {}/{} - {:5.2f}%"
                        .format(num_records, num_lines, num_records * 1. / num_lines * 100))

        digit, label = read_glyph(line)
        digit_deskewed = deskew(digit)
        digit_trimmed = trim_padding(digit_deskewed)
        digit_resized = resize_with_constant_ratio(digit_trimmed, char_dim)
        digit_padded = pad_digit(digit_resized, char_dim)
        item = []
        item.append((digit_padded * 255).astype('uint8'))
        item.append(label)
        dataset.append(item)

    with open(os.path.join(current_dir, pickle_name +
                           str(num_lines) + ".pickle"), 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
It is used to create pickle files from a .txt file containing binary images. For more, see here.
And my "error"...
By now it is at 135% and the last pickle file is around 400 MB...
Why does this happen? Also, it continues to create files (it should have stopped at 400,000 or a little more).
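One quick check worth running (a sketch, using the filename hard-coded in the script): the counter can only pass 100% if the file has more lines than the hard-coded num_lines = 402953, so count the lines directly and compare:

# count the actual lines instead of trusting the hard-coded num_lines
with open("sd19-binary_digits.txt") as f:
    actual_lines = sum(1 for _ in f)
print(actual_lines)  # if this exceeds 402953, the percentage will pass 100%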
After reading this and taking the courses, I am struggling to solve the second problem in assignment 1 (notMnist):
Let's verify that the data still looks good. Displaying a sample of the labels and images from the ndarray. Hint: you can use matplotlib.pyplot.
Here is what I tried:
import random
rand_smpl = [train_datasets[i] for i in sorted(random.sample(xrange(len(train_datasets)), 1))]
print(rand_smpl)
filename = rand_smpl[0]

import pickle
loaded_pickle = pickle.load(open(filename, "r"))

image_size = 28  # Pixel width and height.

import numpy as np
dataset = np.ndarray(shape=(len(loaded_pickle), image_size, image_size),
                     dtype=np.float32)

import matplotlib.pyplot as plt
plt.plot(dataset[2])
plt.ylabel('some numbers')
plt.show()
but this is what I get:
which doesn't make much sense. To be honest, my code may not make much sense either, since I am not really sure how to tackle this problem!
The pickles are created like this:
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
    """Load the data for a single letter label."""
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = (ndimage.imread(image_file).astype(float) -
                          pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (num_images, min_num_images))

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset
where that function is called like this:
dataset = load_letter(folder, min_num_images_per_class)
try:
    with open(set_filename, 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', set_filename, ':', e)
The idea here is:
Now let's load the data in a more manageable format. Since, depending on your computer setup you might not be able to fit it all in memory, we'll load each class into a separate dataset, store them on disk and curate them independently. Later we'll merge them into a single dataset of manageable size.
We'll convert the entire dataset into a 3D array (image index, x, y) of floating point values, normalized to have approximately zero mean and standard deviation ~0.5 to make training easier down the road.
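As a quick check of what that normalization does (a small sketch using the same transform as load_letter above): pixel values in [0, 255] map linearly onto [-0.5, 0.5], which is how the values end up with approximately zero mean.

# the load_letter transform maps [0, 255] onto [-0.5, 0.5]
pixel_depth = 255.0
for value in (0.0, 127.5, 255.0):
    print((value - pixel_depth / 2) / pixel_depth)  # -0.5, 0.0, 0.5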
Do this as below:

# define a function to convert a label to a letter
def letter(i):
    return 'abcdefghij'[i]

# you need matplotlib inline to be able to show images in a python notebook
%matplotlib inline

# some random number in range 0 - length of dataset
sample_idx = np.random.randint(0, len(train_dataset))

# now we show it
plt.imshow(train_dataset[sample_idx])
plt.title("Char " + letter(train_labels[sample_idx]))
Your code actually changed the type of dataset; it is no longer an ndarray of size (220000, 28, 28).
In general, a pickle is a file which holds some objects, not the array itself. You should use the object from the pickle directly to get your train dataset (using the notation from your code snippet):

# this will give you train_dataset and labels
train_dataset = loaded_pickle['train_dataset']
train_labels = loaded_pickle['train_labels']
UPDATED:
Per request from @gsarmas, the link to my solution for the whole Assignment1 lies here.
The code is commented and mostly self-explanatory, but in case of any questions feel free to get in touch via whichever way you prefer on GitHub.
Please check with this code:

pickle_file = train_datasets[0]

with open(pickle_file, 'rb') as f:
    letter_set = pickle.load(f)  # unpickle
    # pick a random image index
    sample_idx = np.random.randint(len(letter_set))
    # extract a 2D slice
    sample_image = letter_set[sample_idx, :, :]

plt.figure()
# display it
plt.imshow(sample_image)
Use this code:

# randomly select a letter
i = np.random.randint(len(train_datasets))
plt.title("abcdefghij"[i])

# read the file of the selected letter
f = open(train_datasets[i], "rb")
f = pickle.load(f)

# randomly select an image in the file
j = np.random.randint(len(f))

# show the image
plt.axis('off')
img = plt.imshow(f[j, :, :])