Python Voice Communication

Python Voice Communication - python

Hello I am trying to figure out some code which is suppose to send voice over the network. I am having problems with the audio it sends but its just a series of loud beeps and not the audio I input
After the beeps are finished I get an EOFError
I have spent the last 48 hours trying to figure this out any ideas are greatly appreciated
The relevant code
import pyaudio
import speex
import sys
chunk = 320
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 5
### Server function ###
def server():
### Initialize socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((HOST, PORT))
s.listen(5)
### Start recieve loop
while True:
...
elif cmd == CMD_AUDIO:
d = speex.Decoder()
d.initialize(speex.SPEEX_MODEID_WB)
p = pyaudio.PyAudio()
stream = p.open(format = FORMAT,
channels = CHANNELS,
rate = RATE,
input = True,
output = True,
frames_per_buffer = chunk)
#voice = cPickle.loads(decrypt_my_message(msg))
voice = cPickle.loads(msg)
print voice
for i in range(len(voice)):
decdata = d.decode(voice[i])#DECODE my data. (YaY)#DECODE my data. (YaY)
stream.write(str(voice), chunk) #Write the data back out to the speakers
stream.stop_stream()
stream.close()
p.terminate()
d.destroy()
if not msg: break
conn.close()
### READ DATA FROM THE MIC ###
def sendAudio():
chunklist = []
init_my_audio = speex.Encoder()
init_my_audio.initialize(speex.SPEEX_MODEID_WB)
p = pyaudio.PyAudio()
stream = p.open(format = FORMAT,
channels = CHANNELS,
rate = RATE,
input = True,
output = True,
frames_per_buffer = chunk)
for i in range(0, 44100 / chunk * RECORD_SECONDS):
try:
data = stream.read(chunk)
except IOError:
pass
encdata = init_my_audio.encode(data)
chunklist.append(encdata)
client(chr(CMD_AUDIO), cPickle.dumps((chunklist), 1))
stream.stop_stream()
stream.close()
p.terminate()
init_my_audio.destroy()

Related

Transmitting Audio stream with Pyaudio and RTP

I have been trying to transmit an audio stream over the network with pyaudio using RTP/UDP.
Since I could find any RTP library so I did my own encoding with some help from articles online.
The problem I am facing right now is when I don't use the RTP library everything works well but when I use the RTP encoding there is some latency issue, I receive chopped-up audio on the receiver side.
I am sharing sender , reveiver and rtp code here.
Any help is appreciated.
Sender_rtp.py
import pyaudio
import sys
import socket
import datetime
import pyrtp_2 as rtp
import random
HOST = sys.argv[1]
PORT = sys.argv[2]
data = bytes() # Stream of audio bytes
CHUNK_SIZE = 1024
BROADCAST_SIZE = 1024
CHANNELS = 1
FORMAT = pyaudio.paInt16 # 2 bytes size
RATE = 16000
# instantiate PyAudio (1)
p = pyaudio.PyAudio()
# define callback (2)
def pyaudio_callback(in_data, frame_count, time_info, status):
global data
data += in_data
return (None, pyaudio.paContinue)
# open stream (3)
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK_SIZE,
stream_callback=pyaudio_callback)
# start the stream (4)
stream.start_stream()
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
#sock.bind((HOST, int(PORT)))
def send_data():
global data
print()
if (len(data) > BROADCAST_SIZE):
packet_vars = {'version' : 2,
'padding' : 0,
'extension' : 0,
'csi_count' : 0,
'marker' : 0,
'payload_type' : 97,
'sequence_number' : random.randint(1,9999),
'timestamp' : random.randint(1,9999),
'ssrc' : 185755418,
'payload' : data}
rtp_packet = rtp.GenerateRTP(packet_vars)
sock.sendto(rtp_packet[:BROADCAST_SIZE], (HOST, int(PORT)))
data = data[BROADCAST_SIZE:]
print(f'Sent {str(BROADCAST_SIZE)} bytes of audio. {datetime.datetime.now().time()}')
try:
while True:
send_data()
except KeyboardInterrupt:
print('\nClosing stream...')
stream.stop_stream()
stream.close()
p.terminate()
sock.close()
Receiver_rtp.py
import pyaudio
import sys
import socket
import pyrtp_2 as rtp
HOST = sys.argv[1]
PORT = sys.argv[2]
data = bytes() # Stream of audio bytes
is_receiving = False
CHUNK_SIZE = 1024 # Size of frame window to write audio (frames_per_buffer)
BROADCAST_SIZE = 1024 # Socket receives audio with this size
BUFFER_SIZE = BROADCAST_SIZE * 4 # Receive this amount of data before playback
CHANNELS = 1
FORMAT = pyaudio.paInt16 # 2 bytes size
RATE = 16000
# instantiate PyAudio (1)
p = pyaudio.PyAudio()
# define callback (2)
def pyaudio_callback(in_data, frame_count, time_info, status):
if not is_receiving:
return (bytes([0] * frame_count * CHANNELS * 2), pyaudio.paContinue)
global data
try:
# Cut the data, if it started to bufferize
if len(data) >= BUFFER_SIZE * 2:
print('Cutting Audio Buffer..')
data = data[-BUFFER_SIZE:]
avail_data_count = min(frame_count * CHANNELS * 2, len(data))
return_data = data[:avail_data_count]
data = data[avail_data_count:]
# Inflate end of the array with zeros, if there is not enough audio.
return_data += bytes([0] * (frame_count * CHANNELS * 2 - avail_data_count))
return (return_data, pyaudio.paContinue)
except:
print('Exception in pyaudio_callback...')
# open stream (3)
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
output=True,
frames_per_buffer=CHUNK_SIZE,
stream_callback=pyaudio_callback)
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.bind((HOST, int(PORT)))
#sock.listen(1)
#connection, client_address = sock.accept()
print('Socket bind succeed.')
try:
while True:
new_data = sock.recv(BROADCAST_SIZE)
print(f"Incoming raw data : {type(new_data)}")
rtp_packet = rtp.DecodeRTP(new_data)
payload = rtp_packet['payload']
##break
data += rtp_packet['payload']
if len(data) >= BUFFER_SIZE and not is_receiving:
is_receiving = True
# start stream (4)
stream.start_stream()
print(f'Stream started, when {len(data)} bytes of data were received.\nThis causes {str(len(data) / RATE)} seconds of latency')
except KeyboardInterrupt:
print('\nClosing socket and stream...')
sock.close()
stream.stop_stream()
stream.close()
p.terminate()
pyrtp.py
def GenerateRTP(packet_vars):
#The first twelve octates are present in every RTP packet.
#The first octet is the version number of the RTP packet.
#The second octet is the padding bit.
#The third octet is the extension bit.
#The fourth octet is the CSRC count.
#The fifth octet is the marker bit.
#The sixth octet is the payload type.
#The seventh to twelve octets are the sequence number.
#The thirteen to eighteen octets are the timestamp.
#The nineteen to twenty-four octets are the synchronization source (SSRC).
#The remaining octets are the payload data.
#Generate fist byte of the header a binary string:
version = format(packet_vars['version'], 'b').zfill(2)
padding = format(packet_vars['padding'], 'b')
extension = format(packet_vars['extension'], 'b')
csrc_count = format(packet_vars['csi_count'], 'b').zfill(4)
byte1 = format(int((version + padding + extension + csrc_count), 2), 'x').zfill(2)
#Generate second byte of the header as binary string:
marker = format(packet_vars['marker'], 'b')
payload_type = format(packet_vars['payload_type'], 'b').zfill(7)
byte2 = format(int((marker + payload_type), 2), 'x').zfill(2)
sequence_number = format(packet_vars['sequence_number'], 'x').zfill(4)
timestamp = format(packet_vars['timestamp'], 'x').zfill(8)
ssrc = format(packet_vars['ssrc'], 'x').zfill(8)
payload = packet_vars['payload'].hex()
packet = byte1 + byte2 + sequence_number + timestamp + ssrc + payload
return packet.encode()
def DecodeRTP(packet_bytes):
#return dict of variables from the packet
packet_vars = {}
byte1 = packet_bytes[0:2]
byte1 = int(byte1, 16)
byte1 = format(byte1, 'b').zfill(8)
packet_vars['version'] = int(byte1[0:2],2)
packet_vars['padding'] = int(byte1[2:3],2)
packet_vars['extension'] = int(byte1[3:4])
packet_vars['csi_count'] = int(byte1[4:8], 2)
byte2 = packet_bytes[2:4]
byte2 = int(byte2, 16)
byte2 = format(byte2, 'b').zfill(8)
packet_vars['marker'] = int(byte2[0:1])
packet_vars['payload_type'] = int(byte2[1:8], 2)
packet_vars['sequence_number'] = int(packet_bytes[4:8], 16)
packet_vars['timestamp'] = int(packet_bytes[8:16], 16)
packet_vars['ssrc'] = int(packet_bytes[16:24], 16)
payload = packet_bytes[24:]
packet_vars['payload'] = bytes.fromhex(payload.decode())
#print(f"payload 4: {type(payload)} {payload}")
return packet_vars

Record inside sound(output from software synthesizer) not from microphone

I record the sound by pyaudio like this
p = pyaudio.PyAudio()
hostAPICount = p.get_host_api_count()
print("Host API Count = " + str(hostAPICount))
for i in range(p.get_device_count()):
print(p.get_device_info_by_index(i))
# check the device.
# 0 -> microphone 1-> headphone
DEVICE_INDEX = 0 #or 1
CHUNK = 1024
FORMAT = pyaudio.paInt16 # 16bit
CHANNELS = 1
RATE = 48000 # sampling frequency [Hz]
time_ = 5 # record time [s]
output_path = "./sample.wav"
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
input_device_index = DEVICE_INDEX,
frames_per_buffer=CHUNK)
print("recording ...")
frames = []
for i in range(0, int(RATE / CHUNK * time_)):
data = stream.read(CHUNK)
frames.append(data)
print("done.")
stream.stop_stream()
stream.close()
p.terminate()
wf = wave.open(output_path, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
It works well for Microphone
However I want to record the sound played in my computer.(Synthesizer Application output it uses CoreAudio on MacOX)
So I changed the device number
DEVICE_INDEX = 0 -> DEVICE_INDEX = 1
But this error message appears
OSError: [Errno -9998] Invalid number of channels
Also changed the channel but in vain, the same message appears
CHANNELS = 1 -> CHANNELS = 2
How can I record the audio which is played from local application??
Is it possible?

I came across this problem while ago and honestly wasn't able to solve it by coding but by redirecting speaker sound to an input with https://vb-audio.com/Cable/. You just need to find the right DEVICE_INDEX by this little check I made earlier. (I think the name of the input is still the same "CABLE Output (VB-Audio Virtual ")
import pyaudio
p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
if p.get_device_info_by_index(i)["name"] == "CABLE Output (VB-Audio Virtual ":
print(p.get_device_info_by_index(i)["index"])
p.terminate()
input("Click to finish")

Continuous Real Time Speech to Text with Watson for Python

I'm trying to create a small Python program that would let me get text in real time using my mic from the Watson server similar to how it works here.
This is the code I have came up with but it gets the text after I finish recording:
import pyaudio
import json
from watson_developer_cloud import SpeechToTextV1
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 10
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
data = stream.read(CHUNK)
frames.append(data)
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
data_feed = b''.join(frames)
speech_to_text = SpeechToTextV1(
username='secret',
password='secret too',
x_watson_learning_opt_out=False
)
result = speech_to_text.recognize(data_feed,
content_type="audio/l16;rate=44100;channels=2",
word_confidence=True,
max_alternatives=4,
word_alternatives_threshold=0.5,
model="en-US_BroadbandModel",
continuous=True)
j = json.dumps(result, indent=2)
print(j)

I went ahead and created a program from scratch to connect to the Watson server using websockets. It still isn't doing exactly what I expect but it is very close.
The audio is being sent to the server in real time but I am getting the transcript after the recording finishes.
import asyncio
import websockets
import json
import requests
import pyaudio
import time
# Variables to use for recording audio
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 16000
p = pyaudio.PyAudio()
# This is the language model to use to transcribe the audio
model = "en-US_BroadbandModel"
# These are the urls we will be using to communicate with Watson
default_url = "https://stream.watsonplatform.net/speech-to-text/api"
token_url = "https://stream.watsonplatform.net/authorization/api/v1/token?" \
"url=https://stream.watsonplatform.net/speech-to-text/api"
url = "wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize?model=en-US_BroadbandModel"
# BlueMix app credentials
username = "" # Your Bluemix App username
password = "" # Your Bluemix App password
# Send a request to get an authorization key
r = requests.get(token_url, auth=(username, password))
auth_token = r.text
token_header = {"X-Watson-Authorization-Token": auth_token}
# Params to use for Watson API
params = {
"word_confidence": True,
"content_type": "audio/l16;rate=16000;channels=2",
"action": "start",
"interim_results": True
}
# Opens the stream to start recording from the default microphone
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
output=True,
frames_per_buffer=CHUNK)
async def send_audio(ws):
# Starts recording of microphone
print("* READY *")
start = time.time()
while True:
try:
print(".")
data = stream.read(CHUNK)
await ws.send(data)
if time.time() - start > 20: # Records for n seconds
await ws.send(json.dumps({'action': 'stop'}))
return False
except Exception as e:
print(e)
return False
# Stop the stream and terminate the recording
stream.stop_stream()
stream.close()
p.terminate()
async def speech_to_text():
async with websockets.connect(url, extra_headers=token_header) as conn:
# Send request to watson and waits for the listening response
send = await conn.send(json.dumps(params))
rec = await conn.recv()
print(rec)
asyncio.ensure_future(send_audio(conn))
# Keeps receiving transcript until we have the final transcript
while True:
try:
rec = await conn.recv()
parsed = json.loads(rec)
transcript = parsed["results"][0]["alternatives"][0]["transcript"]
print(transcript)
#print(parsed)
if "results" in parsed:
if len(parsed["results"]) > 0:
if "final" in parsed["results"][0]:
if parsed["results"][0]["final"]:
#conn.close()
#return False
pass
except KeyError:
conn.close()
return False
# Starts the application loop
loop = asyncio.get_event_loop()
loop.run_until_complete(speech_to_text())
loop.close()
So all I want now is to get the transcript while I am recording through the microphone.

Recording and playing byte list in pyaudio/wave

I want to record and play my voice using pyaudio and wave lib but I don't know how to do it because wave lib requires a path to a file and even if I'm trying to set it as a variable with list of bytes recorded a few second ago, still doesn't work beacuse I can't use 'read' for a list. Does someone have some idea? I want to make a looper like KORG stuff, etc
I want to play it immediately after stopped recording, like real looper, without saving record as file.
There is my code (Python 3.4):
def record(self): #recording a voice
#var for bytes from recording
self.stream = self.player.open(format = self.FORMAT,
channels = self.CHANNELS,
rate = self.RATE,
input = True,
frames_per_buffer = self.CHUNK)
print("Recording")
self.frames = [] #byte list
#recoring for a few seconds (5sec at this moment)
for i in range(0, int(self.RATE / self.CHUNK * self.RECORD_SECONDS)):
self.data = self.stream.read(self.CHUNK) #sing stream do data var
self.frames.append(self.data) #add bytes to the end of a list
print("Stop recording")
self.stopRecording()
def stopRecording(self):
self.stream.stop_stream()
self.stream.close()
print("Recording has been stopped")
self.play()
def play(self): #playing a record
print("Playing")
f = wave.open(self.frames,"rb")
#read data
data = f.readframes(CHUNK)
#play stream
while data != '':
self.stream.write(data)
data = f.readframes(CHUNK)
self.stopPlaying()

After stop your record you need join your appended data, use data = ''.join(self.frames), and at the end build a loop (for, while) to stream all your byte list, here is how i did:
import pyaudio
chunk = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 41000
RECORD_SECONDS = 5
p = pyaudio.PyAudio()
stream = p.open(format = FORMAT,
channels = CHANNELS,
rate = RATE,
input = True,
output = True,
frames_per_buffer = chunk)
print ("***Recording***")
all = []
for i in range(0, RATE / chunk * RECORD_SECONDS):
data = stream.read(chunk)
all.append(data)
print("***Stop recording***")
print ("***START PLAY***")
data = ''.join(all)
for i in range(0, len(data), chunk):
stream.write(data[i:i+chunk])

Having issue with pyaudio and sockets

I am trying to fix a script I wrote. It's a client application and I need to figure out how to use sockets to record data from a computer running linux in my office.
I am using netcat for the server, listening on port 5555.
I know I have to convert the i to a integer, but am having trouble.
I already have a ftp script for sending the .wav file I just need to get the s.recv prompt to work.
import pyaudio
import wave
from socket import*
s = socket()
host = "127.0.0.1"
port = 5555
s.connect((host,port))
s.send("how many seconds?\n")
i = s.recv(2)
CHUNK = 1024
FORMAT = pyaudio.paInt16 #paInt8
CHANNELS = 2
RATE = 44100 #sample rate
RECORD_SECONDS = i
WAVE_OUTPUT_FILENAME = "output.wav"
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK) #buffer
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
data = stream.read(CHUNK)
frames.append(data) # 2 bytes(16 bits) per channel
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()

I recommend you rename i to USER_INPUT to prevent a name clash with the for-loop below. Then convert convert that value to an int using the in-built int(), like so:
s.send("how many seconds?\n")
USER_INPUT = s.recv(2)
RECORD_SECONDS = int(USER_INPUT) # TODO: handle invalid user input
You could confirm this value has been converted, then:
s.send("Shall record for %d seconds\n" % RECORD_SECONDS)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Voice Communication - python

Related

Transmitting Audio stream with Pyaudio and RTP

Record inside sound(output from software synthesizer) not from microphone

Continuous Real Time Speech to Text with Watson for Python

Recording and playing byte list in pyaudio/wave

Having issue with pyaudio and sockets

Categories

Resources