Hello, I am trying to figure out some code which is supposed to send voice over the network. I am having problems with the audio it sends: it's just a series of loud beeps, not the audio I input.
After the beeps are finished I get an EOFError
I have spent the last 48 hours trying to figure this out; any ideas are greatly appreciated.
The relevant code
import pyaudio
import speex
import sys
# Audio parameters shared by server() and sendAudio().
chunk = 320  # frames per buffer (320 samples == one 20 ms wideband Speex frame)
FORMAT = pyaudio.paInt16  # 16-bit signed samples
CHANNELS = 1  # mono
RATE = 44100  # Hz -- NOTE(review): wideband Speex is a 16 kHz codec; confirm this mismatch is intended
RECORD_SECONDS = 5  # length of each clip captured by sendAudio()
### Server function ###
def server():
### Initialize socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((HOST, PORT))
s.listen(5)
### Start recieve loop
while True:
...
elif cmd == CMD_AUDIO:
d = speex.Decoder()
d.initialize(speex.SPEEX_MODEID_WB)
p = pyaudio.PyAudio()
stream = p.open(format = FORMAT,
channels = CHANNELS,
rate = RATE,
input = True,
output = True,
frames_per_buffer = chunk)
#voice = cPickle.loads(decrypt_my_message(msg))
voice = cPickle.loads(msg)
print voice
for i in range(len(voice)):
decdata = d.decode(voice[i])#DECODE my data. (YaY)#DECODE my data. (YaY)
stream.write(str(voice), chunk) #Write the data back out to the speakers
stream.stop_stream()
stream.close()
p.terminate()
d.destroy()
if not msg: break
conn.close()
### READ DATA FROM THE MIC ###
def sendAudio():
    """Record RECORD_SECONDS of microphone audio, Speex-encode it chunk by
    chunk, and ship the pickled chunk list to the server as one CMD_AUDIO
    message.

    NOTE(review): client(), CMD_AUDIO and the speex/cPickle imports are
    assumed to be defined elsewhere in the file.
    """
    chunklist = []
    init_my_audio = speex.Encoder()
    # Wideband Speex operates on 16 kHz audio; feeding it 44.1 kHz samples
    # is a likely contributor to the garbled playback.
    init_my_audio.initialize(speex.SPEEX_MODEID_WB)
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    output=True,
                    frames_per_buffer=chunk)
    for i in range(0, 44100 / chunk * RECORD_SECONDS):
        try:
            data = stream.read(chunk)
        except IOError:
            # BUG FIX: input overflow -- skip this chunk. The original
            # `pass` fell through and encoded a stale (or, on the first
            # iteration, undefined) `data`.
            continue
        encdata = init_my_audio.encode(data)
        chunklist.append(encdata)
    client(chr(CMD_AUDIO), cPickle.dumps(chunklist, 1))
    stream.stop_stream()
    stream.close()
    p.terminate()
    init_my_audio.destroy()
Related
I have been trying to transmit an audio stream over the network with pyaudio using RTP/UDP.
Since I couldn't find any RTP library, I did my own encoding with some help from articles online.
The problem I am facing right now is when I don't use the RTP library everything works well but when I use the RTP encoding there is some latency issue, I receive chopped-up audio on the receiver side.
I am sharing the sender, receiver, and RTP code here.
Any help is appreciated.
Sender_rtp.py
import pyaudio
import sys
import socket
import datetime
import pyrtp_2 as rtp
import random
# Destination address comes from the command line.
HOST = sys.argv[1]
PORT = sys.argv[2]  # NOTE(review): kept as str; converted with int() at send time
data = bytes()  # Stream of audio bytes appended to by the capture callback
CHUNK_SIZE = 1024  # frames per PyAudio buffer
BROADCAST_SIZE = 1024  # bytes of raw audio consumed per RTP packet
CHANNELS = 1  # mono capture
FORMAT = pyaudio.paInt16  # 2 bytes size per sample
RATE = 16000  # capture sample rate in Hz
# instantiate PyAudio (1)
p = pyaudio.PyAudio()
# define callback (2)
def pyaudio_callback(in_data, frame_count, time_info, status):
    """Capture callback: append each recorded buffer to the global byte
    stream. Produces no playback data (the stream is input-only)."""
    global data
    data = data + in_data
    return None, pyaudio.paContinue
# open stream (3): input-only capture stream that feeds pyaudio_callback
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK_SIZE,
                stream_callback=pyaudio_callback)
# start the stream (4)
stream.start_stream()
# UDP socket used to send the RTP packets.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
#sock.bind((HOST, int(PORT)))
def send_data():
    """Pop one BROADCAST_SIZE slice off the global audio buffer and send it
    as a single RTP packet.

    Fixes vs. the original:
    * The payload is only the slice being consumed, and the WHOLE encoded
      packet is sent. GenerateRTP() hex-encodes the payload (24 hex chars
      of header + 2 hex chars per audio byte), so slicing the packet to
      BROADCAST_SIZE bytes silently threw away more than half of every
      chunk while `data` still advanced by the full BROADCAST_SIZE --
      that loss is the chopped-up audio heard at the receiver.
    * sequence_number / timestamp now increase monotonically per packet,
      as RTP requires, instead of being random on every call.
    """
    global data
    print()
    if len(data) > BROADCAST_SIZE:
        # Monotonic RTP counters, stored as function attributes so the
        # zero-argument interface is unchanged.
        seq = getattr(send_data, '_seq', random.randint(1, 9999))
        ts = getattr(send_data, '_ts', random.randint(1, 9999))
        packet_vars = {'version': 2,
                       'padding': 0,
                       'extension': 0,
                       'csi_count': 0,
                       'marker': 0,
                       'payload_type': 97,
                       'sequence_number': seq % 65536,  # 16-bit field
                       'timestamp': ts,
                       'ssrc': 185755418,
                       'payload': data[:BROADCAST_SIZE]}
        rtp_packet = rtp.GenerateRTP(packet_vars)
        # Send the complete packet; UDP preserves datagram boundaries.
        sock.sendto(rtp_packet, (HOST, int(PORT)))
        data = data[BROADCAST_SIZE:]
        send_data._seq = seq + 1
        # BROADCAST_SIZE bytes == BROADCAST_SIZE // 2 16-bit samples.
        send_data._ts = ts + BROADCAST_SIZE // 2
        print(f'Sent {str(BROADCAST_SIZE)} bytes of audio. {datetime.datetime.now().time()}')
# Main loop: keep draining the audio buffer until Ctrl-C, then clean up.
try:
    while True:
        send_data()
except KeyboardInterrupt:
    print('\nClosing stream...')
    stream.stop_stream()
    stream.close()
    p.terminate()
    sock.close()
Receiver_rtp.py
import pyaudio
import sys
import socket
import pyrtp_2 as rtp
# Listen address comes from the command line.
HOST = sys.argv[1]
PORT = sys.argv[2]
data = bytes()  # Stream of audio bytes pending playback
is_receiving = False  # flips True once BUFFER_SIZE bytes have arrived
CHUNK_SIZE = 1024  # Size of frame window to write audio (frames_per_buffer)
BROADCAST_SIZE = 1024  # Socket receives audio with this size
BUFFER_SIZE = BROADCAST_SIZE * 4  # Receive this amount of data before playback
CHANNELS = 1  # mono playback
FORMAT = pyaudio.paInt16  # 2 bytes size per sample
RATE = 16000  # playback sample rate in Hz
# instantiate PyAudio (1)
p = pyaudio.PyAudio()
# define callback (2)
def pyaudio_callback(in_data, frame_count, time_info, status):
    """Output callback: supply `frame_count` frames from the global receive
    buffer, padding with silence whenever the buffer runs short."""
    bytes_needed = frame_count * CHANNELS * 2  # paInt16 -> 2 bytes/sample
    if not is_receiving:
        # Nothing buffered yet -- keep the stream alive with silence.
        return (bytes(bytes_needed), pyaudio.paContinue)
    global data
    try:
        # Drop the oldest audio if the buffer grows too large, so the
        # playback latency stays bounded.
        if len(data) >= BUFFER_SIZE * 2:
            print('Cutting Audio Buffer..')
            data = data[-BUFFER_SIZE:]
        avail_data_count = min(bytes_needed, len(data))
        return_data = data[:avail_data_count]
        data = data[avail_data_count:]
        # Inflate end of the array with zeros, if there is not enough audio.
        return_data += bytes(bytes_needed - avail_data_count)
        return (return_data, pyaudio.paContinue)
    except Exception as exc:
        # BUG FIX: the original bare `except` returned None, which makes
        # PyAudio abort the stream; emit silence and keep going instead.
        print(f'Exception in pyaudio_callback: {exc}')
        return (bytes(bytes_needed), pyaudio.paContinue)
# open stream (3): output-only playback stream fed by pyaudio_callback
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                output=True,
                frames_per_buffer=CHUNK_SIZE,
                stream_callback=pyaudio_callback)
# UDP socket bound to the listen address; each recv() yields one datagram.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.bind((HOST, int(PORT)))
#sock.listen(1)
#connection, client_address = sock.accept()
print('Socket bind succeed.')
# Receive loop: decode each RTP datagram, append its payload to the
# playback buffer, and start the output stream once enough is buffered.
try:
    while True:
        new_data = sock.recv(BROADCAST_SIZE)
        print(f"Incoming raw data : {type(new_data)}")
        rtp_packet = rtp.DecodeRTP(new_data)
        payload = rtp_packet['payload']
        ##break
        data += rtp_packet['payload']
        # Begin playback only after a BUFFER_SIZE jitter buffer has built up.
        if len(data) >= BUFFER_SIZE and not is_receiving:
            is_receiving = True
            # start stream (4)
            # NOTE(review): the printed latency divides bytes by RATE; with
            # 16-bit mono samples the true figure is len(data) / (RATE * 2).
            stream.start_stream()
            print(f'Stream started, when {len(data)} bytes of data were received.\nThis causes {str(len(data) / RATE)} seconds of latency')
except KeyboardInterrupt:
    print('\nClosing socket and stream...')
    sock.close()
    stream.stop_stream()
    stream.close()
    p.terminate()
pyrtp.py
def GenerateRTP(packet_vars):
    """Serialise `packet_vars` into a hex-encoded RTP packet.

    RTP header layout (RFC 3550), 12 octets total:
      octet 1     : version (2 bits), padding (1 bit), extension (1 bit),
                    CSRC count (4 bits)
      octet 2     : marker (1 bit), payload type (7 bits)
      octets 3-4  : sequence number
      octets 5-8  : timestamp
      octets 9-12 : SSRC
    The payload follows the header.
    (The original comments wrongly described each of these bit-fields as
    its own octet; they are sub-byte fields.)

    NOTE(review): the packet is emitted as ASCII *hex* text (two characters
    per octet), not raw binary -- DecodeRTP() expects exactly this format,
    but it doubles the on-wire size.

    Returns: bytes -- hex characters of header followed by payload.
    """
    # First header byte, assembled bit-by-bit as a binary string.
    version = format(packet_vars['version'], 'b').zfill(2)
    padding = format(packet_vars['padding'], 'b')
    extension = format(packet_vars['extension'], 'b')
    csrc_count = format(packet_vars['csi_count'], 'b').zfill(4)
    byte1 = format(int(version + padding + extension + csrc_count, 2), 'x').zfill(2)
    # Second header byte: marker bit + 7-bit payload type.
    marker = format(packet_vars['marker'], 'b')
    payload_type = format(packet_vars['payload_type'], 'b').zfill(7)
    byte2 = format(int(marker + payload_type, 2), 'x').zfill(2)
    # Fixed-width hex fields for the remaining header words.
    sequence_number = format(packet_vars['sequence_number'], 'x').zfill(4)
    timestamp = format(packet_vars['timestamp'], 'x').zfill(8)
    ssrc = format(packet_vars['ssrc'], 'x').zfill(8)
    payload = packet_vars['payload'].hex()
    packet = byte1 + byte2 + sequence_number + timestamp + ssrc + payload
    return packet.encode()
def DecodeRTP(packet_bytes):
    """Parse a hex-encoded RTP packet (as produced by GenerateRTP) back
    into a dict of header fields plus the raw `payload` bytes."""
    packet_vars = {}
    # First header octet: version / padding / extension / CSRC count.
    byte1 = format(int(packet_bytes[0:2], 16), 'b').zfill(8)
    packet_vars['version'] = int(byte1[0:2], 2)
    packet_vars['padding'] = int(byte1[2:3], 2)
    # base 2 made explicit -- the original omitted it here and for the
    # marker bit, which only worked because '0'/'1' parse identically
    # in base 10.
    packet_vars['extension'] = int(byte1[3:4], 2)
    packet_vars['csi_count'] = int(byte1[4:8], 2)
    # Second header octet: marker bit + 7-bit payload type.
    byte2 = format(int(packet_bytes[2:4], 16), 'b').zfill(8)
    packet_vars['marker'] = int(byte2[0:1], 2)
    packet_vars['payload_type'] = int(byte2[1:8], 2)
    packet_vars['sequence_number'] = int(packet_bytes[4:8], 16)
    packet_vars['timestamp'] = int(packet_bytes[8:16], 16)
    packet_vars['ssrc'] = int(packet_bytes[16:24], 16)
    # Everything after the 12-octet header (24 hex chars) is payload.
    payload = packet_bytes[24:]
    packet_vars['payload'] = bytes.fromhex(payload.decode())
    return packet_vars
I record the sound by pyaudio like this
# Enumerate the audio devices so the right input index can be picked below.
p = pyaudio.PyAudio()
hostAPICount = p.get_host_api_count()
print("Host API Count = " + str(hostAPICount))
for i in range(p.get_device_count()):
    print(p.get_device_info_by_index(i))

# check the device.
# 0 -> microphone 1 -> headphone
DEVICE_INDEX = 0  # or 1
CHUNK = 1024  # frames per buffer
FORMAT = pyaudio.paInt16  # 16bit
CHANNELS = 1
RATE = 48000  # sampling frequency [Hz]
time_ = 5  # record time [s]
output_path = "./sample.wav"

# Open a capture stream on the chosen device.
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                input_device_index=DEVICE_INDEX,
                frames_per_buffer=CHUNK)

print("recording ...")
frames = []
# Read time_ seconds of audio, one CHUNK at a time.
for i in range(0, int(RATE / CHUNK * time_)):
    data = stream.read(CHUNK)
    frames.append(data)
print("done.")

stream.stop_stream()
stream.close()
p.terminate()

# Write the captured frames out as a WAV file.
# NOTE(review): relies on `import wave`, which is not shown in this snippet.
wf = wave.open(output_path, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
It works well for Microphone
However I want to record the sound played in my computer.(Synthesizer Application output it uses CoreAudio on MacOX)
So I changed the device number
DEVICE_INDEX = 0 -> DEVICE_INDEX = 1
But this error message appears
OSError: [Errno -9998] Invalid number of channels
Also changed the channel but in vain, the same message appears
CHANNELS = 1 -> CHANNELS = 2
How can I record the audio which is played from local application??
Is it possible?
I came across this problem a while ago and, honestly, I wasn't able to solve it in code; instead I redirected the speaker sound to an input with https://vb-audio.com/Cable/. You just need to find the right DEVICE_INDEX with this little check I made earlier. (I think the name of the input is still "CABLE Output (VB-Audio Virtual ".)
import pyaudio

p = pyaudio.PyAudio()
# Find the VB-Audio virtual cable input by name and print its index.
# (PortAudio truncates the device name, hence the odd trailing space.)
for i in range(p.get_device_count()):
    info = p.get_device_info_by_index(i)  # fetch once instead of twice
    if info["name"] == "CABLE Output (VB-Audio Virtual ":
        print(info["index"])
p.terminate()
input("Click to finish")
I'm trying to create a small Python program that would let me get text in real time using my mic from the Watson server similar to how it works here.
This is the code I have came up with but it gets the text after I finish recording:
import pyaudio
import json
from watson_developer_cloud import SpeechToTextV1

# Recording parameters.
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 10

p = pyaudio.PyAudio()
# Capture stream for the default microphone.
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
print("* recording")
frames = []
# Record the whole clip into memory first -- nothing is sent to Watson
# until recording completes, which is why no interim text appears.
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()

# Single blocking recognize() call over the finished recording.
data_feed = b''.join(frames)
speech_to_text = SpeechToTextV1(
    username='secret',
    password='secret too',
    x_watson_learning_opt_out=False
)
result = speech_to_text.recognize(data_feed,
                                  content_type="audio/l16;rate=44100;channels=2",
                                  word_confidence=True,
                                  max_alternatives=4,
                                  word_alternatives_threshold=0.5,
                                  model="en-US_BroadbandModel",
                                  continuous=True)
j = json.dumps(result, indent=2)
print(j)
I went ahead and created a program from scratch to connect to the Watson server using websockets. It still isn't doing exactly what I expect but it is very close.
The audio is being sent to the server in real time but I am getting the transcript after the recording finishes.
import asyncio
import websockets
import json
import requests
import pyaudio
import time

# Variables to use for recording audio
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 16000
p = pyaudio.PyAudio()

# This is the language model to use to transcribe the audio
model = "en-US_BroadbandModel"

# These are the urls we will be using to communicate with Watson
default_url = "https://stream.watsonplatform.net/speech-to-text/api"
token_url = "https://stream.watsonplatform.net/authorization/api/v1/token?" \
            "url=https://stream.watsonplatform.net/speech-to-text/api"
url = "wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize?model=en-US_BroadbandModel"

# BlueMix app credentials
username = ""  # Your Bluemix App username
password = ""  # Your Bluemix App password

# Send a request to get an authorization key
r = requests.get(token_url, auth=(username, password))
auth_token = r.text
token_header = {"X-Watson-Authorization-Token": auth_token}

# Params to use for Watson API; interim_results asks for partial transcripts.
params = {
    "word_confidence": True,
    "content_type": "audio/l16;rate=16000;channels=2",
    "action": "start",
    "interim_results": True
}

# Opens the stream to start recording from the default microphone
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                output=True,
                frames_per_buffer=CHUNK)
async def send_audio(ws):
    """Stream microphone chunks to the websocket for ~20 seconds, then send
    the stop action.

    BUG FIX: the original placed the stream cleanup after the `while True`
    loop whose every exit path is a `return`, so it never ran; it now lives
    in a `finally` block.
    """
    # Starts recording of microphone
    print("* READY *")
    start = time.time()
    try:
        while True:
            try:
                print(".")
                data = stream.read(CHUNK)
                await ws.send(data)
                if time.time() - start > 20:  # Records for n seconds
                    await ws.send(json.dumps({'action': 'stop'}))
                    return False
            except Exception as e:
                print(e)
                return False
    finally:
        # Stop the stream and terminate the recording
        stream.stop_stream()
        stream.close()
        p.terminate()
async def speech_to_text():
    """Open the recognize websocket, start streaming audio, and print each
    transcript Watson sends back until the connection yields a message
    without results."""
    async with websockets.connect(url, extra_headers=token_header) as conn:
        # Send request to watson and wait for the listening response.
        await conn.send(json.dumps(params))
        rec = await conn.recv()
        print(rec)
        asyncio.ensure_future(send_audio(conn))
        # Keeps receiving transcript until we have the final transcript.
        while True:
            try:
                rec = await conn.recv()
                parsed = json.loads(rec)
                # A message without "results" raises KeyError here, which
                # is what actually terminates the loop below. The original
                # "final" guard pyramid after this line was dead code (its
                # innermost body was `pass`), so it has been removed.
                transcript = parsed["results"][0]["alternatives"][0]["transcript"]
                print(transcript)
            except KeyError:
                conn.close()
                return False
# Starts the application loop and blocks until speech_to_text() finishes.
# NOTE(review): asyncio.run() is the modern (3.7+) equivalent of this trio.
loop = asyncio.get_event_loop()
loop.run_until_complete(speech_to_text())
loop.close()
So all I want now is to get the transcript while I am recording through the microphone.
I want to record and play my voice using the pyaudio and wave libraries, but I don't know how to do it: wave requires a path to a file, and even if I set it to a variable holding the list of bytes recorded a few seconds ago, it still doesn't work because I can't call 'read' on a list. Does anyone have an idea? I want to make a looper like the KORG devices, etc.
I want to play it immediately after stopped recording, like real looper, without saving record as file.
There is my code (Python 3.4):
def record(self):  # recording a voice
    """Capture self.RECORD_SECONDS of audio into self.frames, then hand off
    to stopRecording().

    Assumes self.player is an initialised pyaudio.PyAudio and that
    self.FORMAT / CHANNELS / RATE / CHUNK / RECORD_SECONDS are set --
    presumably in __init__, which is not shown here.
    """
    # var for bytes from recording
    self.stream = self.player.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   frames_per_buffer=self.CHUNK)
    print("Recording")
    self.frames = []  # byte list
    # recording for a few seconds (5 sec at this moment)
    for i in range(0, int(self.RATE / self.CHUNK * self.RECORD_SECONDS)):
        self.data = self.stream.read(self.CHUNK)  # read one chunk into self.data
        self.frames.append(self.data)  # add bytes to the end of the list
    print("Stop recording")
    self.stopRecording()
def stopRecording(self):
    """Stop and close the capture stream, then immediately start playback."""
    self.stream.stop_stream()
    self.stream.close()
    print("Recording has been stopped")
    self.play()
def play(self):  # playing a record
    """Play back the frames captured by record() straight from memory.

    BUG FIX: the original passed the frame list to wave.open(), which
    expects a filename or file object and therefore fails. A looper needs
    no WAV file at all -- the raw frames can be written directly to an
    output stream.
    """
    print("Playing")
    # A fresh output stream is required: stopRecording() closed self.stream.
    out = self.player.open(format=self.FORMAT,
                           channels=self.CHANNELS,
                           rate=self.RATE,
                           output=True,
                           frames_per_buffer=self.CHUNK)
    recording = b''.join(self.frames)
    # Feed the speakers one CHUNK at a time.
    for i in range(0, len(recording), self.CHUNK):
        out.write(recording[i:i + self.CHUNK])
    out.stop_stream()
    out.close()
    self.stopPlaying()
After stopping your recording you need to join the appended data with data = b''.join(self.frames), and then build a loop (for/while) to stream the whole byte string. Here is how I did it:
import pyaudio

chunk = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 41000
RECORD_SECONDS = 5

p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                output=True,
                frames_per_buffer=chunk)

print("***Recording***")
# Record into memory chunk by chunk.
frames = []  # renamed from `all`, which shadowed the builtin of that name
for i in range(0, int(RATE / chunk * RECORD_SECONDS)):  # int() for Python 3
    data = stream.read(chunk)
    frames.append(data)
print("***Stop recording***")

print("***START PLAY***")
# stream.read() returns bytes, so join with b'' -- the original ''.join
# raises TypeError on Python 3.
data = b''.join(frames)
for i in range(0, len(data), chunk):
    stream.write(data[i:i + chunk])
I am trying to fix a script I wrote. It's a client application and I need to figure out how to use sockets to record data from a computer running linux in my office.
I am using netcat for the server, listening on port 5555.
I know I have to convert the variable i to an integer, but I am having trouble doing so.
I already have a ftp script for sending the .wav file I just need to get the s.recv prompt to work.
import pyaudio
import wave
from socket import *

s = socket()
host = "127.0.0.1"
port = 5555
s.connect((host, port))
# Sockets carry bytes on Python 3 -- sending a str raises TypeError.
s.send(b"how many seconds?\n")
i = s.recv(2)

CHUNK = 1024
FORMAT = pyaudio.paInt16  # paInt8
CHANNELS = 2
RATE = 44100  # sample rate
# BUG FIX: recv() returns bytes; convert before the arithmetic below
# (int() tolerates surrounding whitespace such as a trailing newline).
RECORD_SECONDS = int(i)
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)  # buffer
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)  # 2 bytes(16 bits) per channel
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()

# Persist the capture as a WAV file for the separate FTP upload script.
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
I recommend you rename i to USER_INPUT to prevent a name clash with the for-loop below. Then convert that value to an int using the built-in int(), like so:
# Prompt the peer, read its reply, and convert it for the recording loop.
s.send("how many seconds?\n")
USER_INPUT = s.recv(2)
RECORD_SECONDS = int(USER_INPUT)  # TODO: handle invalid user input
You could confirm this value has been converted, then:
# Echo the parsed value back so the user can confirm the conversion worked.
s.send("Shall record for %d seconds\n" % RECORD_SECONDS)