I am trying to use Python to create a tool for imaging CF cards with a Raspberry Pi.
I had most of it working until I implemented compressed images with dd.
When I try to pipe the output of gzip to dd, I lose the ability to poke the dd process to get progress.
I have tried using multiple subprocesses but keep getting broken-pipe or no-such-file errors.
Below is my code:
#!/usr/bin/env python
from Adafruit_CharLCD import Adafruit_CharLCD
import os
import sys
import time
import signal
from subprocess import Popen, PIPE
lcd = Adafruit_CharLCD()
lcd.begin(16,2)
imgpth = '/home/pi/image/image_V11.img.gz'
line0 = ""
line1 = ""
q = 0
r = 0
s = 0
def lcdPrint(column, row, message, clear=False):
    if ( clear == True ):
        lcd.clear()
    lcd.setCursor(column, row)
    lcd.message(message)
lcd.clear()
lcdPrint(0, 0, 'Preparing Copy', True)
lcdPrint(0, 1, '')
gz = Popen(['gunzip -c /home/pi/image/image_V11.img.gz'], stdout=PIPE)
dd = Popen(['dd of=/dev/sda'],stdin=gz.stdout, stderr=PIPE)
filebyte = os.path.getsize(imgpth)
flsz = filebyte/1024000
while dd.poll() is None:
    time.sleep(1)
    dd.send_signal(signal.SIGUSR1)
    while 1:
        l = dd.stderr.readline()
        if '(' in l:
            param, value = l.split('b',1)
            line1 = param.rstrip()
            r = float(line1)
            s = r/1024000
            break
    lcdPrint(0, 0, 'Copying....', True)
    q = round(s/flsz*100, 2)
    per = str(q)
    lcdPrint(0, 1, per + '% Complete',)
lcdPrint(0, 0, 'Copy Complete', True)
time.sleep(1)
exit()
How can I fix this?
I stumbled across this question because I am doing exactly the same. My complete solution is here:
http://github.com/jrmhaig/Bakery
I've tried to pick out some differences between what I have and yours that might show you the solution.
When starting the dd I redirected both stderr and stdout to the pipe.
dd = subprocess.Popen(['dd', 'of=/dev/sda', 'bs=1M'], bufsize=1, stdin=unzip.stdout, stdout=PIPE, stderr=STDOUT)
I don't think this should really make a difference. Everything you need should go to stderr but for some reason it appeared to get mixed up for me.
I use a separate thread to pick up the output from dd:
def read_pipe(out, queue):
    for line in iter(out.readline, b''):
        queue.put(str(line))
    out.close()
dd_queue = queue.Queue()
dd_thread = threading.Thread(target = read_pipe, args=(dd.stdout, dd_queue))
dd_thread.daemon = True
dd_thread.start()
Then when you call:
dd.send_signal(signal.SIGUSR1)
the output gets caught on dd_queue.
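A minimal sketch of how the main loop could then drain dd_queue after poking dd (my illustration, not code from the repository; it assumes the queued lines are plain text in dd's usual "NNN bytes (...) copied, ..." SIGUSR1 report format):
import queue
import signal
import time

while dd.poll() is None:
    dd.send_signal(signal.SIGUSR1)        # ask dd for a progress report
    time.sleep(1)
    try:
        while True:                       # drain everything queued so far
            line = dd_queue.get_nowait()
            if 'bytes' in line and 'copied' in line:
                copied_bytes = float(line.split()[0])   # bytes written so far
                print(copied_bytes)
    except queue.Empty:
        pass                              # no new report this second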
I also found that the uncompressed size of a gzipped file is stored in the last 4 bytes:
fl = open(str(imgpath), 'rb')
fl.seek(-4, 2)
r = fl.read()
fl.close()
size = struct.unpack('<I', r)[0]
os.path.getsize(imgpth) will only give you the compressed size so the percentage calculation will be wrong.
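Putting the two together, the percentage would then be computed against the uncompressed size (a sketch; copied_bytes is the figure parsed from dd's progress line in the earlier snippet):
percent = round(copied_bytes / float(size) * 100, 2)
lcdPrint(0, 1, str(percent) + '% Complete')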
Thanks to all for your time.
I'm trying to find out whether several servers are up or down using ping, and it works . . . but when I try to convert the result into "up" or "down", something is wrong and the result is always "down".
I don't know what else I should try; I don't need anything else, just up or down and the IP.
import os
import datetime
import platform
import subprocess
import string
date = datetime.datetime.now()
day = date.day
hour = date.hour
def writedoc ():
    os.chdir ('Path')
    wresult = open ("pingresults_{}_{}.txt".format(day,hour), 'a')
    wresult.write ('{}-{}\n'.format(ips, rping))
    wresult.close ()
os.chdir ('Path')
openips = open ("ips.txt","r")
ipfile = openips.readlines()
for ips in ipfile:
    ips = ips.strip()
    print (ips)
    args = ["ping", "-n", "4", "-l", "1", "-w", "1000", ips]
    pping = subprocess.Popen(args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    rping = pping.stdout
    for line in rping:
        print (line)
    if (rping.find("(100% perdidos)" != -1)):
        result = "down"
        print (result)
    else:
        result = "up"
        print (result)
    writedoc()
if (rping.find("(100% perdidos)" != -1))
Should this instead be
if (rping.find("(100% perdidos)") != -1)
So that this checks that rping.find("(100% perdidos)") does not return -1.
With your example you are effectively calling rping.find(True), because the inner expression "(100% perdidos)" != -1 evaluates to True.
I may be approaching this all wrong, but this is where I'm at. I have very large log files I'm trying to search, up to 30 GB in some cases. I'm writing a script to pull info and have been playing with multiprocessing to speed it up a bit. Right now I'm testing running two functions at the same time to search from the top and the bottom of the file, which seems to work. I'm wondering if it's possible to stop one function once the other finds a result, e.g. if the top function finds a result, they both stop. That way I can build it out as needed.
from file_read_backwards import FileReadBackwards
from multiprocessing import Process
import sys
z = "log.log"
#!/usr/bin/env python
rocket = 0
def top():
    target = "test"
    with open(z) as src:
        found = None
        for line in src:
            if len(line) == 0: break #happens at end of file, then stop loop
            if target in line:
                found = line
                break
    print(found)

def bottom():
    target = "text"
    with FileReadBackwards(z) as src:
        found = None
        for line in src:
            if len(line) == 0: break #happens at end of file, then stop loop
            if target in line:
                found = line
                break
    print(found)

if __name__=='__main__':
    p1 = Process(target = top)
    p1.start()
    p2 = Process(target = bottom)
    p2.start()
Here's a proof-of-concept of the approach I mentioned in the comments:
import os
import random
import sys
from multiprocessing import Process, Value
def search(proc_no, file_name, seek_to, max_size, find, flag):
    stop_at = seek_to + max_size
    with open(file_name) as f:
        if seek_to:
            f.seek(seek_to - 1)
            prev_char = f.read(1)
            if prev_char != '\n':
                # Landed in the middle of a line. Skip back one (or
                # maybe more) lines so this line isn't excluded. Start
                # by seeking back 256 bytes, then 512 if necessary, etc.
                exponent = 8
                pos = seek_to
                while pos >= seek_to:
                    pos = f.seek(max(0, pos - (2 ** exponent)))
                    f.readline()
                    pos = f.tell()
                    exponent += 1

        while True:
            if flag.value:
                break
            line = f.readline()
            if not line:
                break  # EOF
            data = line.strip()
            if data == find:
                flag.value = proc_no
                print(data)
                break
            if f.tell() > stop_at:
                break

if __name__ == '__main__':
    # list.txt contains lines with the numbers 1 to 1000001
    file_name = 'list.txt'
    info = os.stat(file_name)
    file_size = info.st_size

    if len(sys.argv) == 1:
        # Pick a random value from list.txt
        num_lines = 1000001
        choices = list(range(1, num_lines + 1))
        choices.append('XXX')
        find = str(random.choice(choices))
    else:
        find = sys.argv[1]

    num_procs = 4
    chunk_size, remainder = divmod(file_size, num_procs)
    max_size = chunk_size + remainder
    flag = Value('i', 0)
    procs = []

    print(f'Using {num_procs} processes to look for {find} in {file_name}')

    for i in range(num_procs):
        seek_to = i * chunk_size
        proc = Process(target=search, args=(i + 1, file_name, seek_to, max_size, find, flag))
        procs.append(proc)

    for proc in procs:
        proc.start()

    for proc in procs:
        proc.join()

    if flag.value:
        print(find, 'found by proc', flag.value)
    else:
        print(find, 'not found')
After reading various posts[1] about reading files with multiprocessing and multithreading, it seems that neither is a great approach due to potential disk thrashing and serialized reads. So here's a different, simpler approach that is way faster (at least for the file with a million lines I was trying it out on):
import mmap
import sys
def search_file(file_name, text, encoding='utf-8'):
    text = text.encode(encoding)
    with open(file_name) as f:
        with mmap.mmap(f.fileno(), 0, flags=mmap.ACCESS_READ, prot=mmap.PROT_READ) as m:
            index = m.find(text)
            if index > -1:
                # Found a match; now find beginning of line that
                # contains match so we can grab the whole line.
                while index > 0:
                    index -= 1
                    if m[index] == 10:
                        index += 1
                        break
                else:
                    index = 0
                m.seek(index)
                line = m.readline()
                return line.decode(encoding)

if __name__ == '__main__':
    file_name, search_string = sys.argv[1:]
    line = search_file(file_name, search_string)
    sys.stdout.write(line if line is not None else f'Not found in {file_name}: {search_string}\n')
I'm curious how this would perform with a 30GB log file.
[1] Including this one
Simple example using a multiprocessing.Pool and callback function.
Terminates remaining pool processes once a result has returned.
You could add an arbitrary number of processes to search from different offsets in the file using this approach.
import math
import time
from multiprocessing import Pool
from random import random
def search(pid, wait):
    """Sleep for wait seconds, return PID
    """
    time.sleep(wait)
    return pid

def done(result):
    """Do something with result and stop other processes
    """
    print("Process: %d done." % result)
    pool.terminate()
    print("Terminate Pool")
pool = Pool(2)
pool.apply_async(search, (1, math.ceil(random() * 3)), callback=done)
pool.apply_async(search, (2, math.ceil(random() * 3)), callback=done)
# do other stuff ...
# Wait for result
pool.close()
pool.join() # block our main thread
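For the actual problem in the question, a sketch of the same callback/terminate pattern applied to searching a file from several offsets could look like this (the file name, target string, and chunking are illustrative assumptions, not code from the answer above):
import os
from multiprocessing import Pool

def search_chunk(offset, length, file_name, target):
    """Scan `length` bytes of file_name starting at `offset` for target."""
    with open(file_name) as f:
        f.seek(offset)
        if offset:
            f.readline()                # skip the partial line we landed in
        while f.tell() < offset + length:
            line = f.readline()
            if not line:                # end of file
                break
            if target in line:
                return line
    return None

def found(line):
    if line is not None:
        print("hit:", line.strip())
        pool.terminate()                # stop the remaining workers

if __name__ == '__main__':
    file_name, target = 'log.log', 'test'    # names taken from the question
    size = os.path.getsize(file_name)
    workers = 4
    chunk = size // workers + 1
    pool = Pool(workers)
    for i in range(workers):
        pool.apply_async(search_chunk, (i * chunk, chunk, file_name, target),
                         callback=found)
    pool.close()
    pool.join()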
This is essentially the same as Blurp's answer, but I shortened it and made it a bit more general. As you can see, top should be an infinite loop, but bottom stops it immediately.
from multiprocessing import Process

valNotFound = True

def top():
    i = 0
    while valNotFound:
        i += 1

def bottom():
    global valNotFound
    valNotFound = False

p1 = Process(target = top)
p2 = Process(target = bottom)
p1.start()
p2.start()
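One caveat worth adding (my note, not part of the answer above): each multiprocessing.Process gets its own copy of module-level globals, so setting valNotFound in bottom() is never seen by top(). A sketch of the same idea using a shared multiprocessing.Value:
from multiprocessing import Process, Value

def top(not_found):
    i = 0
    while not_found.value:          # spins until the shared flag is cleared
        i += 1

def bottom(not_found):
    not_found.value = 0             # clearing the flag stops top()

if __name__ == '__main__':
    not_found = Value('i', 1)
    p1 = Process(target=top, args=(not_found,))
    p2 = Process(target=bottom, args=(not_found,))
    p1.start()
    p2.start()
    p1.join()
    p2.join()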
I'm currently writing a script that reads Reddit comments from a large file (5 GB compressed, ~30 GB of data being read). My script reads the comments, checks for some text, parses them, and sends them off to a Queue function (running in a separate thread). No matter what I do, I always get a MemoryError on a specific iteration (number 8162735, if it matters in the slightest). And I can't seem to handle the error; Windows just keeps shutting down Python when it hits. Here's my script:
import ujson
from tqdm import tqdm
import bz2
import json
import threading
import spacy
import Queue
import time
nlp = spacy.load('en')
def iter_comments(loc):
    with bz2.BZ2File(loc) as file_:
        for i, line in (enumerate(file_)):
            yield ujson.loads(line)['body']

objects = iter_comments('RC_2015-01.bz2')
q = Queue.Queue()
f = open("reddit_dump.bin", 'wb')

def worker():
    while True:
        item = q.get()
        f.write(item)
        q.task_done()

for i in range(0, 2):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()

def finish_parse(comment):
    global q
    try:
        comment_parse = nlp(unicode(comment))
        comment_bytes = comment_parse.to_bytes()
        q.put(comment_bytes)
    except MemoryError:
        print "MemoryError with comment {0}, waiting for Queue to empty".format(comment)
        time.sleep(2)
    except AssertionError:
        print "AssertionError with comment {0}, skipping".format(comment)

for comment in tqdm(objects):
    comment = str(comment.encode('ascii', 'ignore'))
    if ">" in comment:
        c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
        c_parse_thread.start()
q.join()
f.close()
Does anybody know what I'm doing wrong?
Looks like it's not in your code but may be in the data. Have you tried skipping that iteration?
x = 0
for comment in tqdm(objects):
    x += 1
    if x != 8162735:
        comment = str(comment.encode('ascii', 'ignore'))
        if ">" in comment:
            c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
            c_parse_thread.start()
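A slightly tidier sketch of the same skip, using enumerate instead of a manual counter (based on the loop from the question):
for x, comment in enumerate(tqdm(objects), start=1):
    if x == 8162735:
        continue                    # skip the iteration that triggers the error
    comment = str(comment.encode('ascii', 'ignore'))
    if ">" in comment:
        c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
        c_parse_thread.start()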
I need a simple way to pass the stdout of a subprocess as a list to another function using multiprocess:
The first function that invokes subprocess:
def beginRecvTest():
    command = ["receivetest","-f=/dev/pcan33"]
    incoming = Popen(command, stdout = PIPE)
    processing = iter(incoming.stdout.readline, "")
    lines = list(processing)
    return lines
The function that should receive lines:
def readByLine(lines):
    i = 0
    while (i < len(lines)):
        system("clear")
        if(lines[i][0].isdigit()):
            line = lines[i].split()
            dictAdd(line)
        else:
            next
        print ; print "-" *80
        for _i in mydict.keys():
            printMsg(mydict, _i)
        print "Keys: ", ; print mydict.keys()
        print ; print "-" *80
        sleep(0.3)
        i += 1
and the main from my program:
if __name__ == "__main__":
    dataStream = beginRecvTest()
    p = Process(target=dataStream)
    reader = Process(target=readByLine, args=(dataStream,))
    p.start()
    reader.start()
I've read up on using queues, but I don't think that's exactly what I need.
The subprocess called returns infinite data so some people have suggested using tempfile, but I am totally confused about how to do this.
At the moment the script only returns the first line read, and all my attempts at looping the beginRecvTest() function have ended in errors.
I'm trying to capture a string from the output of a subprocess and when the subprocess asks for user input, include the user input in the string, but I can't get stdout to work.
I got the string output from stdout using a while loop, but I don't know how to terminate it after reading the string.
I tried using subprocess.check_output, but then I can't see the prompts for user input.
import subprocess
import sys
child = subprocess.Popen(["java","findTheAverage"], stdout = subprocess.PIPE, stdin = subprocess.PIPE )
string = u""
while True:
    line = str(child.stdout.read(1))
    if line != '':
        string += line[2]
        print(string)
    else:
        break

print(string)

for line in sys.stdin:
    print(line)
    child.stdin.write(bytes(line, 'utf-8'))
EDIT:
With help and code from Alfe's post, I now have a string being built from the subprocess program's output and the user's input to that program, but it's jumbled about.
The string appears to first get the first letter of the output, then the user input, then the rest of the output.
Example of string muddling:
U2
3ser! please enter a double:U
4ser! please enter another double: U
5ser! please enter one final double: Your numbers were:
a = 2.0
b = 3.0
c = 4.0
average = 3.0
Is meant to be:
User! please enter a double:2
User! please enter another double: 3
User! please enter one final double: 4
Your numbers were:
a = 2.0
b = 3.0
c = 4.0
average = 3.0
Using the code:
import subprocess
import sys
import signal
import select
def signal_handler(signum, frame):
    raise Exception("Timed out!")

child = subprocess.Popen(["java","findTheAverage"], universal_newlines = True, stdout = subprocess.PIPE, stdin = subprocess.PIPE )

string = u""
stringbuf = ""

while True:
    print(child.poll())
    if child.poll() != None and not stringbuf:
        break
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(1)
    try:
        r, w, e = select.select([ child.stdout, sys.stdin ], [], [])
        if child.stdout in r:
            stringbuf = child.stdout.read(1)
            string += stringbuf
            print(stringbuf)
    except:
        print(string)
        print(stringbuf)
    if sys.stdin in r:
        typed = sys.stdin.read(1)
        child.stdin.write(typed)
        string += typed
FINAL EDIT:
Alright, I played around with it and got it working with this code:
import subprocess
import sys
import select
import fcntl
import os
# the string that we will return filled with tasty program output and user input #
string = ""

# the subprocess running the program #
child = subprocess.Popen(["java","findTheAverage"], bufsize = 0, universal_newlines = True, stdout = subprocess.PIPE, stdin = subprocess.PIPE )

# stuff to stop IO blocks in child.stdout and sys.stdin #
# (I stole it from http://stackoverflow.com/a/8980466/2674170)
fcntl.fcntl(child.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
fcntl.fcntl(sys.stdin.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)

# this here in the unlikely event that the program has #
# finished by the time the main loop is first running  #
# because if that happened the loop would end without  #
# having added the programs output to the string!      #
progout = ""
typedbuf = "#"

### here we have the main loop, this friendly fellah is
### going to read from the program and user, and tell
### each other what needs to be known
while True:

    ## stop when the program finishes and there is no more output
    if child.poll() != None and not progout:
        break

    # read from the user
    typed = ""
    while typedbuf:
        try:
            typedbuf = sys.stdin.read(1)
        except:
            break
        typed += typedbuf
    stringbuf = "#"
    string += typed
    child.stdin.write(typed)

    progout = ""
    progoutbuf = "#"
    while progoutbuf:
        try:
            progoutbuf = child.stdout.read(1)
        except:
            typedbuf = "#"
            break
        progout += progoutbuf
    if progout:
        print(progout)
        string += progout

# the final output string #
print( string)
You need select to read from more than one source at the same time (in your case stdin and the output of the child process).
import select
string = ''
while True:
    r, w, e = select.select([ child.stdout, sys.stdin ], [], [])
    if child.stdout in r:
        string += child.stdout.read()
    if sys.stdin in r:
        typed = sys.stdin.read()
        child.stdin.write(typed)
        string += typed
You will still need to find a proper breaking condition to leave that loop. But you probably get the idea already.
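As a sketch of one possible breaking condition (assuming the child was started with plain byte pipes, i.e. without universal_newlines): use os.read() so a single read never blocks waiting for EOF, and leave the loop once the child has exited and its pipe is drained.
import os
import select
import sys

output = b''
while True:
    r, w, e = select.select([child.stdout, sys.stdin], [], [], 0.5)
    if child.stdout in r:
        chunk = os.read(child.stdout.fileno(), 4096)
        if not chunk:                       # EOF: the child closed its stdout
            break
        output += chunk
    if sys.stdin in r:
        typed = sys.stdin.readline()
        child.stdin.write(typed.encode())
        child.stdin.flush()
        output += typed.encode()
    if child.poll() is not None and not r:  # child exited, nothing left to read
        break
print(output.decode(errors="replace"))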
I want to give a warning at this point: Processes writing into pipes typically buffer until the latest possible moment; you might not expect this because when testing the same program from the command line (in a terminal) typically only lines get buffered. This is due to performance considerations. When writing to a terminal, typically a user expects to see the output as soon as possible. When writing to a pipe, typically a reading process is happy to be given larger chunks in order to sleep longer before they arrive.
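One common workaround for that buffering, sketched here independently of the program above, is to give the child a pseudo-terminal instead of a pipe, so its C library falls back to line buffering the way it would in an interactive terminal:
import os
import pty
import subprocess

master, slave = pty.openpty()
child = subprocess.Popen(
    ["java", "findTheAverage"],       # example command from the question
    stdin=slave, stdout=slave, stderr=slave,
    close_fds=True,
)
os.close(slave)                       # the parent keeps only the master end

# Output (and echoed input) now arrives promptly, line by line.
data = os.read(master, 1024)
print(data.decode(errors="replace"))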