I am rewriting a program from a previous question, and I am having trouble with it. Please see the code:
#!/usr/bin/python
import subprocess, time, timeit
from multiprocessing import Process, Queue
import re, os, pprint, math
from collections import defaultdict

Dict = {}
identifier = ""
hexbits = []
count = defaultdict(int)

def __ReadRX__(RX_info):
    lines = iter(RX_info.stdout.readline, "")
    try:
        start = time.clock()
        for line in lines:
            if re.match(r"^\d+.*$", line):
                splitline = line.split()
                del splitline[1:4]
                identifier = splitline[1]
                count[identifier] += 1
                end = time.clock()
                timing = round((end - start) * 10000, 100)
                dlc = splitline[2]
                hexbits = splitline[3:]
                Dict[identifier] = [dlc, hexbits, count[identifier], int(timing)]
                start = end
    except KeyboardInterrupt:
        pass

procRX = subprocess.Popen('receivetest -f=/dev/pcan32'.split(), stdout=subprocess.PIPE)

if __name__ == '__main__':
    munchCan = Process(target=__ReadRX__, args=(procRX))
    munchCan.start()
    munchCan.join()
    print Dict
When trying to run the code I get the following error:
File "./cancheck2.py", line 36, in <module>
munchCan = Process(target=__ReadRx__, args=(procRX))
File "/usr/lib/python2.7/multiprocessing/process.py", line 104, in __init__
self._args = tuple(args)
TypeError: 'Popen' objec is not iterable
This code worked before I separated the subprocess and set up __ReadRX__ as a separate process.
Could anyone explain what is happening? I don't quite understand.
(procRX) doesn't create a tuple; parentheses around a single expression are just grouping. You have to use (procRX,), with a trailing comma, to get a one-element tuple.
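For illustration, a minimal sketch of the fix; only the args line changes, and Process then receives a one-element tuple and passes procRX through as the single argument:

if __name__ == '__main__':
    # Note the trailing comma: (procRX,) is a one-element tuple,
    # whereas (procRX) is just procRX wrapped in parentheses.
    munchCan = Process(target=__ReadRX__, args=(procRX,))
    munchCan.start()
    munchCan.join()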
Traceback (most recent call last):
  File "C:\xampp\htdocs\Plag\scripts\main.py", line 8, in <module>
    from extractdocx import *
  File "C:\xampp\htdocs\Plag\scripts\extractdocx.py", line 18, in <module>
    from docx import opendocx, getdocumenttext
  File "C:\Users\zeesh\AppData\Local\Programs\Python\Python39\lib\site-packages\docx.py", line 30, in <module>
    from exceptions import PendingDeprecationWarning
ModuleNotFoundError: No module named 'exceptions'
Here is my main.py:

# -*- coding: utf-8 -*-
# Master script for the plagiarism-checker
# Coded by: Shashank S Rao

# import other modules
from cosineSim import *
from htmlstrip import *
from extractdocx import *

# import required modules
import codecs
import traceback
import sys
import operator
import urllib.request, urllib.parse, urllib.error
import json as simplejson

# Given a text string, remove all non-alphanumeric
# characters (using Unicode definition of alphanumeric).
def getQueries(text, n):
    import re
    sentenceEnders = re.compile('[.!?]')
    sentenceList = sentenceEnders.split(text)
    sentencesplits = []
    for sentence in sentenceList:
        x = re.compile(r'\W+', re.UNICODE).split(sentence)
        x = [ele for ele in x if ele != '']
        sentencesplits.append(x)
    finalq = []
    for sentence in sentencesplits:
        l = len(sentence)
        l = l / n
        index = 0
        for i in range(0, l):
            finalq.append(sentence[index:index+n])
            index = index + n - 1
        if index != len(sentence):
            finalq.append(sentence[len(sentence)-index:len(sentence)])
    return finalq

# Search the web for the plagiarised text
# Calculate the cosineSimilarity of the given query vs matched content on google
# This is returned as 2 dictionaries
def searchWeb(text, output, c):
    try:
        text = text.encode('utf-8')
    except:
        text = text
    query = urllib.parse.quote_plus(text)
    if len(query) > 60:
        return output, c
    # using googleapis for searching web
    base_url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q='
    url = base_url + '%22' + query + '%22'
    request = urllib.request.Request(url, None, {'Referer': 'Google Chrome'})
    response = urllib.request.urlopen(request)
    results = simplejson.load(response)
    try:
        if (len(results) and 'responseData' in results and 'results' in results['responseData'] and results['responseData']['results'] != []):
            for ele in results['responseData']['results']:
                Match = results['responseData']['results'][0]
                content = Match['content']
                if Match['url'] in output:
                    # print text
                    # print strip_tags(content)
                    output[Match['url']] = output[Match['url']] + 1
                    c[Match['url']] = (c[Match['url']]*(output[Match['url']] - 1) + cosineSim(text, strip_tags(content)))/(output[Match['url']])
                else:
                    output[Match['url']] = 1
                    c[Match['url']] = cosineSim(text, strip_tags(content))
    except:
        return output, c
    return output, c

# Use the main function to scrutinize a file for
# plagiarism
def main():
    # n-grams N VALUE SET HERE
    n = 9
    if len(sys.argv) < 3:
        print("Usage: python main.py <input-filename>.txt <output-filename>.txt")
        sys.exit()
    if sys.argv[1].endswith(".docx"):
        t = docxExtract(sys.argv[1])
    else:
        t = open(sys.argv[1], 'r')
    if not t:
        print("Invalid Filename")
        print("Usage: python main.py <input-filename>.txt <output-filename>.txt")
        sys.exit()
    t = t.read()
    queries = getQueries(t, n)
    q = [' '.join(d) for d in queries]
    found = []
    # using 2 dictionaries: c and output
    # output is used to store the url as key and number of occurences of that url in different searches as value
    # c is used to store url as key and sum of all the cosine similarities of all matches as value
    output = {}
    c = {}
    i = 1
    count = len(q)
    if count > 100:
        count = 100
    for s in q[:100]:
        output, c = searchWeb(s, output, c)
        msg = "\r" + str(i) + "/" + str(count) + " completed..."
        sys.stdout.write(msg)
        sys.stdout.flush()
        i = i + 1
    # print("\n")
    f = open(sys.argv[2], "w")
    for ele in sorted(iter(c.items()), key=operator.itemgetter(1), reverse=True):
        f.write(str(ele[0]) + " " + str(ele[1]*100.00))
        f.write("\n")
    f.close()
    print("\nDone!")

if __name__ == "__main__":
    try:
        main()
    except:
        # writing the error to stdout for better error detection
        error = traceback.format_exc()
        print(("\nUh Oh!\n" + "Plagiarism-Checker encountered an error!:\n" + error))
The docx package's last release was in 2014. It imports the module exceptions, which was a top-level module in Python 2.7 but was removed in Python 3:
$ python2.7 -c "import exceptions"
$ python3.7 -c "import exceptions"
Traceback (most recent call last):
  File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'exceptions'
The bottom line: the package is only for Python 2. Use Python 2.7 or find a different package.
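If you need to stay on Python 3, one hedged option (an assumption about your needs, not part of the original answer): the separately maintained python-docx package can extract paragraph text, which could stand in for the opendocx/getdocumenttext calls used in extractdocx.py. A minimal sketch:

# pip install python-docx -- imported as `docx`, but a different,
# maintained project than the abandoned 2014 `docx` package
from docx import Document

def docx_to_text(path):
    # Concatenate the text of every paragraph in the .docx file.
    doc = Document(path)
    return "\n".join(p.text for p in doc.paragraphs)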
I am trying to crawl abstracts from PubMed and filter them using a regex in Python. To speed things up, I wanted to use Python's multiprocessing pool.
My code looks like the following:
import multiprocessing as mp
from functools import partial
from typing import List, Tuple

def beautify_abstract(abstract: str, regex: str):
    import re
    result: str = ""
    last_start = 0
    matches = re.finditer(regex, abstract, re.MULTILINE)
    for matchNum, match in enumerate(matches, start=1):
        result += abstract[last_start:match.start()]
        result += "<b>"
        result += abstract[match.start():match.end()]
        result += "</b>"
        last_start = match.end()
    result += abstract[last_start:]
    return result

def get_doi(pim: str, regex: str):
    from Bio import Entrez
    from Bio.Entrez import efetch
    import re
    from metapub.convert import pmid2doi
    Entrez.email = "Your.Name.Here#example.org"
    print(f"Processing {pim}")
    abstract_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='all')
    abstract = abstract_handle.read()
    abstract_handle.close()
    if re.search(regex, abstract, re.MULTILINE) is not None:
        docsum_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='docsum').read()
        docsum = docsum_handle.read()
        try:
            doi = pmid2doi(pim)
        except:
            doi = "UNKNOWN"
        return f"{doi}"
    return ""

def get_pim_with_regex_list(keywords: List[str]) -> List[str]:
    from Bio import Entrez
    Entrez.email = "Your.Name.Here#example.org"
    searchterm = " ".join(keywords)
    pims = []
    handle = Entrez.esearch(db="pubmed", retstart=0, retmax=0, term=searchterm, idtype="acc")
    record = Entrez.read(handle)
    handle.close()
    count = int(record['Count'])
    if count > 100000:
        retmax = 100000
    else:
        retmax = count
    retstart = 0
    while retstart < count:
        handle = Entrez.esearch(db="pubmed", retstart=retstart, retmax=retmax, term=searchterm, idtype="acc")
        record = Entrez.read(handle)
        handle.close()
        for pim in record['IdList']:
            pims.append(pim)
        retstart += retmax
    return pims

if __name__ == '__main__':
    keywords = ["keyword1", "keyword2"]
    pim_list = get_pim_with_regex_list(keywords)
    regex = "keyword1 keyword2"
    worker_fn = partial(get_doi, regex=regex)
    pool = mp.Pool(mp.cpu_count())
    entries = pool.map(worker_fn, pim_list)
    pool.close()
    pool.join()
When I run the given code, I get the following error:
Traceback (most recent call last):
  File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
I did some digging into multiprocessing with Python and found out that only Python native types are supported as parameters (enforced by the ForkingPickler).
Assuming that str is a native type, the code should work... Currently, I am completely lost and have no idea what the problem may be.
As suggested, I uploaded a minimal (sequential) working example here
Is there any way to fix this problem or at least diagnose the real issue here?
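One way to probe it (a guess based on the traceback, not a confirmed fix): Entrez.read returns Biopython element classes such as Bio.Entrez.Parser.StringElement rather than plain str, and their __new__ takes extra tag/attributes arguments, which would match the unpickling error exactly. Checking the element type and coercing the IDs to plain strings before pool.map would test that theory:

# Hypothetical diagnostic: are the PubMed IDs plain str,
# or a Biopython subclass that the pickler cannot re-create?
print(type(pim_list[0]))

# If they are StringElements, coerce them to plain str so the pool's
# ForkingPickler can round-trip them without extra constructor arguments.
pim_list = [str(p) for p in pim_list]
entries = pool.map(worker_fn, pim_list)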
I am trying to add a progress bar to my script, but I couldn't succeed; I think it is because the script is multi-threaded, or maybe the bar should be added in a separate thread. I found plenty of solutions on Stack Overflow, for example the tqdm library, but I couldn't implement them. I am also confused about where exactly in my code the progress bar has to go to make it work.
This is my code:
# -*- coding: utf-8 -*
from __future__ import unicode_literals
# !/usr/bin/python
import codecs
from multiprocessing.dummy import Pool

start_raw = "myfile"
threads = 10

with codecs.open(start_raw, mode='r', encoding='ascii', errors='ignore') as f:
    lists = f.read().splitlines()
lists = list((lists))

def myfunction(x):
    try:
        print x
    except:
        pass

def Main():
    try:
        pp = Pool(int(threads))
        pr = pp.map(myfunction, lists)
    except:
        pass

if __name__ == '__main__':
    Main()
I have tried this solution: https://stackoverflow.com/a/45276885/9746396
# -*- coding: utf-8 -*
from __future__ import unicode_literals
# !/usr/bin/python
import codecs
from multiprocessing.dummy import Pool
import tqdm

start_raw = "myfile"
threads = 1

with codecs.open(start_raw, mode='r', encoding='ascii', errors='ignore') as f:
    lists = f.read().splitlines()
lists = list((lists))

def myfunction(x):
    try:
        print(x)
    except:
        pass

def Main():
    try:
        pp = Pool(int(threads))
        pr = pp.map(myfunction, lists)
    except:
        pass

if __name__ == '__main__':
    with Pool(2) as p:
        r = list(tqdm.tqdm(p.imap(Main(), range(30)), total=30))
But the code does not run, and I get an exception (TypeError: 'NoneType' object is not callable):
  0%|          | 0/30 [00:00<?, ?it/s]Traceback (most recent call last):
  File "file.py", line 35, in <module>
    r = list(tqdm.tqdm(p.imap(Main(), range(30)), total=30))
  File "C:\mypath\Python\Python38-32\lib\site-packages\tqdm\std.py", line 1118, in __iter__
    for obj in iterable:
  File "C:\mypath\Python\Python38-32\lib\multiprocessing\pool.py", line 865, in next
    raise value
  File "C:\mypath\Python\Python38-32\lib\multiprocessing\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
TypeError: 'NoneType' object is not callable
  0%|          | 0/30 [00:00<?, ?it/s]
I presume you wanted to pass myfunction instead of Main to imap, consistently with the first example.
When you pass Main() to p.imap in r = list(tqdm.tqdm(p.imap(Main(), range(30)), total=30)), Python executes Main immediately and passes its return value (None) as the first argument to imap.
You should remove the parentheses after Main: r = list(tqdm.tqdm(p.imap(Main, range(30)), total=30)).
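Putting both points together, a minimal sketch of the intended script (assuming the goal is to run myfunction over the lines in lists, with the bar advancing once per processed line):

from multiprocessing.dummy import Pool
import tqdm

if __name__ == '__main__':
    with Pool(int(threads)) as p:
        # imap yields results lazily, so tqdm can advance the bar as each
        # line finishes; total=len(lists) sizes the bar correctly.
        results = list(tqdm.tqdm(p.imap(myfunction, lists), total=len(lists)))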
I want to scrape <p> from pages, and since there will be a couple of thousand of them I want to use multiprocessing. However, it doesn't work when I try to append the result to some variable.
I want to append the result of the scraping to data = [].
I made a url_common for the base website, since some pages don't start with HTTP etc.
from tqdm import tqdm
import faster_than_requests as requests  # 20% faster on average in my case than urllib.request
import bs4 as bs

def scrape(link, data):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            data.append(p.text)
The above doesn't work, since map() doesn't accept a function with a signature like that.
I tried to use it another way:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            print(p.text)

from multiprocessing import Pool
p = Pool(10)
links = ['link', 'other_link', 'another_link']
data = p.map(scrape, links)
I get this error when using the above function:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 110, in worker
    task = get()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\queues.py", line 354, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scrape' on <module '__main__' (built-in)>
I have not figured out a way to do it so that it uses Pool and at the same time appends the result of the scraping to the given variable.
EDIT
I changed it a little to see where it stops:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.investing.com/'
        else:
            url_common = ''
        try:  # tries are always helpful with urls, as you never know
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        print('works1')
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        print('works2')
        for p in paragraphs:
            print(p.text)

links = ['link', 'other_link', 'another_link']
scrape(links)
# WORKS PROPERLY AND PRINTS EVERYTHING

if __name__ == '__main__':
    p = Pool(5)
    print(p.map(scrape, links))
    # DOESN'T WORK, NOTHING PRINTS. Error like above
You are using the map function incorrectly.
It iterates over each element of the iterable and calls the function on each element.
You can see the map function as doing something like the following:
to_be_mapped = [1, 2, 3]
mapped = []

def mapping(x):  # <-- note that the mapping accepts a single value
    return x**2

for item in to_be_mapped:
    res = mapping(item)
    mapped.append(res)
So to solve your problem, remove the outermost for-loop, as iterating is handled by the map function:
def scrape(link):
    if link[:3] != 'htt':
        url_common = 'https://www.common_url.com/'
    else:
        url_common = ''
    try:
        ht = requests.get2str(url_common + str(link))
    except:
        pass
    parsed = bs.BeautifulSoup(ht, 'lxml')
    paragraphs = parsed.find_all('p')
    for p in paragraphs:
        print(p.text)
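If the goal is still to fill data rather than print, one sketch (assuming it is acceptable for scrape to return the paragraph texts for a single link, so that Pool.map can collect them in the parent process):

from multiprocessing import Pool
import faster_than_requests as requests
import bs4 as bs

def scrape(link):
    if link[:3] != 'htt':
        url_common = 'https://www.common_url.com/'
    else:
        url_common = ''
    try:
        ht = requests.get2str(url_common + str(link))
    except:
        return []  # skip pages that fail to download
    parsed = bs.BeautifulSoup(ht, 'lxml')
    # return the paragraph texts instead of printing them
    return [p.text for p in parsed.find_all('p')]

if __name__ == '__main__':
    links = ['link', 'other_link', 'another_link']
    with Pool(10) as pool:
        # map returns one list per link; flatten the lists into data
        data = [text for page in pool.map(scrape, links) for text in page]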
So I am learning Python and redoing some old projects. This project involves taking in a dictionary and a message to be translated from the command line, and translating the message. (For example: "btw, hello how r u" would be translated to "by the way, hello how are you".)
We are using a scanner supplied by the professor to read in tokens and strings. If necessary I can post it here too. Here's my error:
Nathans-Air-4:py1 Nathan$ python translate.py test.xlt test.msg
Traceback (most recent call last):
  File "translate.py", line 26, in <module>
    main()
  File "translate.py", line 13, in main
    dictionary,count = makeDictionary(commandDict)
  File "/Users/Nathan/cs150/extra/py1/support.py", line 12, in makeDictionary
    string = s.readstring()
  File "/Users/Nathan/cs150/extra/py1/scanner.py", line 105, in readstring
    return self._getString()
  File "/Users/Nathan/cs150/extra/py1/scanner.py", line 251, in _getString
    if (delimiter == chr(0x2018)):
ValueError: chr() arg not in range(256)
Here's my main translate.py file:
from support import *
from scanner import *
import sys

def main():
    arguments = len(sys.argv)
    if arguments != 3:
        print 'Need two arguments!\n'
        exit(1)
    commandDict = sys.argv[1]
    commandMessage = sys.argv[2]
    dictionary, count = makeDictionary(commandDict)
    message, messageCount = makeMessage(commandMessage)
    print(dictionary)
    print(message)
    i = 0
    while count < messageCount:
        translation = translate(message[i], dictionary, messageCount)
        print(translation)
        count = count + 1
        i = i + 1

main()
And here is the support.py file I am using:
from scanner import *

def makeDictionary(filename):
    fp = open(filename, "r")
    s = Scanner(filename)
    lyst = []
    token = s.readtoken()
    count = 0
    while (token != ""):
        lyst.append(token)
        string = s.readstring()
        count = count + 1
        lyst.append(string)
        token = s.readtoken()
    return lyst, count

def translate(word, dictionary, count):
    i = 0
    while i != count:
        if word == dictionary[i]:
            return dictionary[i+1]
            i = i + 1
        else:
            return word
            i = i + 1
    return 0

def makeMessage(filename):
    fp = open(filename, "r")
    s = Scanner(filename)
    lyst2 = []
    string = s.readtoken()
    count = 0
    while (string != ""):
        lyst2.append(string)
        string = s.readtoken()
        count = count + 1
    return lyst2, count
Does anyone know what's going on here? I've looked through it several times and I don't know why readstring is throwing this error... It's probably something stupid I missed.
chr(0x2018) will work if you use Python 3.
You have code that's written for Python 3, but you run it with Python 2. In Python 2, chr gives you a one-character string in the ASCII range. This is an 8-bit string, so the maximum parameter value for chr is 255. In Python 3 you get a Unicode character, and Unicode code points can go up to much higher values.
The issue is that the character you're converting using chr isn't within the range accepted (range(256)). The value 0x2018 in decimal is 8216.
Check out unichr, and also see chr.
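For illustration, a quick interpreter sketch of the difference:

# Python 2: chr is limited to 0..255; unichr covers all of Unicode
>>> unichr(0x2018)
u'\u2018'
>>> chr(0x2018)          # raises ValueError: chr() arg not in range(256)

# Python 3: chr covers the full Unicode range, and unichr is gone
>>> chr(0x2018)
'‘'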