I'm trying to write a web parser script using the requests module. Here is my current code:
import requests
import subprocess
import json
import sys
import threading
import time
from Queue import Queue
numberOfViewers = int(sys.argv[1])
builderThreads = int(sys.argv[2])
startTime = time.time()
numberOfSockets = 0
concurrent = 25
urls = []
urlsUsed = []
def getURL():  # Get tokens
    output = subprocess.Popen(["livestreamer", "twitch.tv/CHANNEL_NAME", "-j"],
                              stdout=subprocess.PIPE).communicate()[0]
    return json.loads(output)['streams']['worst']['url']  # Parse json and return the URL parameter

def build():  # Builds a set of tokens, aka viewers
    global numberOfSockets
    global numberOfViewers
    while True:
        if numberOfSockets < numberOfViewers:
            numberOfSockets += 1
            print("Building viewers " + str(numberOfSockets) + "/" + str(numberOfViewers))
            urls.append(getURL())

def view():  # Opens connections to send views
    global numberOfSockets
    while True:
        url = q.get()
        requests.head(url)
        if url in urlsUsed:
            urls.remove(url)
            urlsUsed.remove(url)
            numberOfSockets -= 1
        else:
            urlsUsed.append(url)
            q.task_done()

            if __name__ == '__main__':
                for i in range(0, builderThreads):
                    threading.Thread(target=build).start()

                while True:
                    while numberOfViewers != numberOfSockets:  # Wait until sockets are built
                        time.sleep(1)

                    q = Queue(concurrent * 2)
                    for i in range(concurrent):
                        try:
                            t = threading.Thread(target=view)
                            t.daemon = True
                            t.start()
                        except:
                            print('thread error')
                    try:
                        for url in urls:
                            print(url)
                            q.put(url.strip())
                        q.join()
                    except KeyboardInterrupt:
                        sys.exit(1)
But when I run the code, it says:
Traceback (most recent call last):
File "C:\Users\flamelier\Desktop\Twitch.py", line 1, in <module>
import requests
ImportError: No module named 'requests'
Why am I getting this error? How do I install this module?
Will this error keep repeating for all the scripts henceforth?
How can I prevent similar errors in the future?
Requests is a third-party module. You should first install it into Python using pip or easy_install.
You have to run pip3 install requests; requests doesn't come with Python by default, as it is a third-party library.
Even after you have pip3-installed requests, the code shown won't do anything. The
if __name__ == "__main__"
test and everything after it is part of an else block in the view function. Move this line and the block that follows back out to the left margin.
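As for preventing this kind of error in the future: check which interpreter you are actually running and whether the module is importable for it. A quick diagnostic sketch you can run at the Python prompt:

import sys
print(sys.executable)   # the interpreter actually being used

try:
    import requests
    print(requests.__version__)
except ImportError:
    # this interpreter doesn't have it; install with e.g.  python -m pip install requests
    print("requests is not installed for this interpreter")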
I'm trying to get the generated domain name or IP address from flask_ngrok or py-ngrok after it has been deployed. I want to deploy the Flask app to localhost and get the new IP address or domain name on the main page.
I.e., if I access 127.0.0.1/ I want it to return something like
You can now log in through https://aaf8447ee878.ngrok.io/
I have tried checking through the directories and reading some help, but I still can't get it. Thanks in advance ❤
Add the following:
import atexit
import json
import os
import platform
import shutil
import subprocess
import tempfile
import time
import zipfile
from pathlib import Path
from threading import Timer
import requests
def _run_ngrok():
    ngrok_path = str(Path(tempfile.gettempdir(), "ngrok"))
    _download_ngrok(ngrok_path)
    system = platform.system()
    if system == "Darwin":
        command = "ngrok"
    elif system == "Windows":
        command = "ngrok.exe"
    elif system == "Linux":
        command = "ngrok"
    else:
        raise Exception(f"{system} is not supported")
    executable = str(Path(ngrok_path, command))
    os.chmod(executable, 0o777)  # octal literal; a plain 777 would set the wrong permission bits
    ngrok = subprocess.Popen([executable, 'http', '5000'])
    atexit.register(ngrok.terminate)
    localhost_url = "http://localhost:4040/api/tunnels"  # Url with tunnel details
    time.sleep(1)
    tunnel_url = requests.get(localhost_url).text  # Get the tunnel information
    j = json.loads(tunnel_url)
    tunnel_url = j['tunnels'][0]['public_url']  # Do the parsing of the get
    tunnel_url = tunnel_url.replace("https", "http")
    return tunnel_url

def _download_ngrok(ngrok_path):
    if Path(ngrok_path).exists():
        return
    system = platform.system()
    if system == "Darwin":
        url = "https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-darwin-amd64.zip"
    elif system == "Windows":
        url = "https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-windows-amd64.zip"
    elif system == "Linux":
        url = "https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip"
    else:
        raise Exception(f"{system} is not supported")
    download_path = _download_file(url)
    with zipfile.ZipFile(download_path, "r") as zip_ref:
        zip_ref.extractall(ngrok_path)

def _download_file(url):
    local_filename = url.split('/')[-1]
    r = requests.get(url, stream=True)
    download_path = str(Path(tempfile.gettempdir(), local_filename))
    with open(download_path, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
    return download_path

def start_ngrok():
    global ngrok_address
    ngrok_address = _run_ngrok()
    print(f" * Running on {ngrok_address}")
    print(f" * Traffic stats available on http://127.0.0.1:4040")

def run_with_ngrok(app):
    """
    The provided Flask app will be securely exposed to the public internet via ngrok when run,
    and its ngrok address will be printed to stdout.
    :param app: a Flask application object
    :return: None
    """
    old_run = app.run

    def new_run():
        thread = Timer(1, start_ngrok)
        thread.setDaemon(True)
        thread.start()
        old_run()

    app.run = new_run
####################
Don't import flask_ngrok. At the end, just before if __name__ == '__main__', add this function:

def ngrok_url():
    global tunnel_url
    while True:
        try:
            print(ngrok_address)
        except Exception as e:
            print(e)

and just before app.run() put:

thread = Timer(1, ngrok_url)
thread.setDaemon(True)
thread.start()

and run it. Warning: the endless print loop can freeze your code editor or terminal; if you don't want that, replace the print in ngrok_url with whatever you actually want to do with the URL.

You also don't need the

global tunnel_url

line:

def ngrok_url():
    while True:
        try:
            print(ngrok_address)
        except Exception as e:
            print(e)

You can also delete the threading part before if __name__ == '__main__'. After the imports, set

ngrok_address = ''

and then you can access ngrok_address anywhere in your code.
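To tie it together, here is a minimal sketch of how the pieces above might be wired up, assuming the helper code above (_run_ngrok, _download_ngrok, start_ngrok, run_with_ngrok) is pasted into the same file as the Flask app:

from flask import Flask
# ... helper code from above goes here ...

ngrok_address = ''                   # default, set right after the imports as suggested above

app = Flask(__name__)
run_with_ngrok(app)                  # patches app.run so ngrok starts alongside the dev server

@app.route('/')
def index():
    # start_ngrok() rebinds the module-level ngrok_address shortly after startup
    # (it may still be empty for the first second or two after the server starts)
    return "You can now log in through " + ngrok_address

if __name__ == '__main__':
    app.run()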
I found the easiest way to do this is to just capture the URL when a user visits the site. You can do that with:

from flask import request  # if not already imported

@app.before_request
def before_request():
    global url
    url = request.url
    # url = url.replace('http://', 'https://', 1)
    url = url.split('.ngrok.io')[0]
    url += '.ngrok.io'
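A view registered on the same app can then return the captured value; for example (a hypothetical route, relying on the hook above having run for the current request):

@app.route('/')
def index():
    # `url` was filled in by the before_request hook above
    return "You can now log in through " + url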
I'm currently setting up some sensors with my Raspberry Pi in Python. Total Python newbie here.
Each sensor has its own script to read the sensor, and there is another script that drives an LCD display and shows the imported variable from the sensor script.
I've gotten so far with working scripts that run the sensors and generate output; however, I cannot seem to import the variables (temperature & pH) into the LCD display script. Also, once I have imported the variables, how do I instruct the LCD script to "refresh" and fetch the updated variable?
Here's a trimmed down version of what I have so far; I've omitted the sensor and data logging parts of each script. For simplicity, script_display is the LCD driver, pH_script is for pH, and temp_script is for temperature.
Here's a simplified version of the scripts:
script_display.py
import sys
sys.path.insert(0, '/home/pi/Raspberry-Pi-sample-code')
import pH_script
import temp_script
from ph_script import ph_main
from temp_script import get_temp
import time
while True:
    print PH.ph_main(ph_output)
    print get_temp(temp)
    time.sleep(1)
temp_script.py
from w1thermsensor import W1ThermSensor
import time
#Get Temperature
def get_temp():
    global temp
    sensor = W1ThermSensor(W1ThermSensor.THERM_SENSOR_DS18B20, "031683a0a4ff")
    activate_temp = sensor.get_temperature()
    temp = str(activate_temp)
    return temp

#Read Temp Frequency
def read():
    threading.Timer(0.5, read).start()
    get_temp()
    time.sleep(1)

try:
    while True:
        read()
        get_temp()
        print get_temp()
except KeyboardInterrupt:
    print("Program Ended By User")
pH_script.py
def ph_main():
    # lots and lots of code to activate the pH probe and set other variables;
    # the variable ph_output comes back as a string
    try:
        while True:
            global ph_output
            dev.send_cmd("R")
            lines = dev.read_lines()
            for i in range(len(lines)):
                print lines[i]
                if lines[i][0] != '*':
                    print lines[i]
                    ph_output = str(lines[i])
                    return ph_output
            time.sleep(delaytime)
    # (except clause omitted in this trimmed-down listing)

try:
    while True:
        ph_main()
except KeyboardInterrupt:
    print("Continuous polling stopped")
So, again: first, how do I pass the variables from the sensor scripts back to the display script? And second, how do I instruct the display script to 'refresh' the variables?
The error I am currently getting is:
Traceback (most recent call last):
File "script_display.py", line 8, in <module>
print PH.ph_main(ph_output)
NameError: name 'ph_output' is not defined
Looking forward to any input, and thanks for your time and help!
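For what it's worth, a minimal sketch of the usual pattern (sensor reads stubbed out, names borrowed from the question): each sensor module exposes a function that returns a fresh reading, and the display loop calls those functions on every pass, so the values refresh themselves and no globals need to be shared between the scripts.

import time

# In the real project these two functions live in temp_script.py and pH_script.py
# and would be imported with:
#     from temp_script import get_temp
#     from pH_script import ph_main
def get_temp():
    # real code would read the DS18B20 here; stubbed out for illustration
    return "21.3"

def ph_main():
    # real code would poll the pH probe here; stubbed out for illustration
    return "7.02"

# Display loop: calling the functions on each pass fetches a fresh value,
# so there is nothing extra to "refresh" and nothing to import except the functions.
while True:
    print("Temp: " + get_temp())
    print("pH:   " + ph_main())
    time.sleep(1)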
I have some links in a database which I want to download in parallel. I tried doing it serially but it took too much time. I have around 1877 links.
I tried this code for running the downloads in parallel, but it throws an error: failed: 'tuple' object has no attribute 'read'
#!/usr/bin/env python
import urllib
from stream import ThreadPool
URLs = [
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://www.economist.com/',
'http://nonexistant.website.at.baddomain/',
'http://slashdot.org/',
'http://reddit.com/',
'http://news.ycombinator.com/'
]
def retrieve(urls):
    for url in urls:
        print url, ' '
        res = urllib.urlretrieve(url).read()
        yield url, res

if __name__ == '__main__':
    retrieved = URLs >> ThreadPool(retrieve, poolsize=7)
    for url, content in retrieved:
        print '%r is %d bytes' % (url, len(content))
    for url, exception in retrieved.failure:
        print '%r failed: %s' % (url, exception)
I tried this as well:
import urllib
import tldextract
from multiprocessing.pool import ThreadPool
URLs = [
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://www.economist.com/',
'http://nonexistant.website.at.baddomain/',
'http://slashdot.org/',
'http://reddit.com/',
'http://news.ycombinator.com/'
]
def dwld(url):
    print url
    res = urllib.urlopen(url).read()
    filename = tldextract.extract(url)
    with open(filename.domain, 'wb') as fh:
        fh.write(res)
    return url
pool = ThreadPool(processes = 4)
pool.map(dwld, URLs)
Gives me
Traceback (most recent call last):
File "dwld_thread.py", line 26, in
pool.map(dwld, URLs)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 148, in map
return self.map_async(func, iterable, chunksize).get()
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 422, in get
raise self._value
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known
I have no idea what that stream.ThreadPool is that you're using, or what its API is… but the problem is obvious:
res = urllib.urlretrieve(url).read()
If you look at the doc for urlretrieve:
Return a tuple (filename, headers) where filename is the local file name under which the object can be found…
You obviously can't call read on that. If you want to download to a local file, using this legacy API, and then read that file, you can:
filename, headers = urllib.urlretrieve(url)
with open(filename) as f:
    res = f.read()
But why? Just use urllib2.urlopen, which "returns a file-like object with two additional methods", so you can just call read on it, and you won't be creating a temporary file, and you're not using an old function that wasn't quite designed right that nobody has maintained in years.
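For instance (Python 2, a minimal fetch with urllib2):

import urllib2

# read() works directly on the object urlopen returns; no temporary file involved
content = urllib2.urlopen('http://www.example.com/').read()
print '%d bytes' % len(content)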
But Python has a nice ThreadPoolExecutor built into the standard library. And if you look at the very first example they show you, it's exactly what you're trying to do.
Unfortunately, you're using Python 2.x, which doesn't have the concurrent.futures module. Fortunately, there is a backport on PyPI that works with 2.5+.
Python also has multiprocessing.dummy.Pool (also available under the undocumented, but probably more readable, name multiprocessing.ThreadPool). But if you're willing to go outside the stdlib for some module that you apparently aren't sure how to use and that I've never heard of, I'm guessing you won't have any problem using futures. So:
import futures
import urllib2
URLs = [
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://www.economist.com/',
'http://nonexistant.website.at.baddomain/',
'http://slashdot.org/',
'http://reddit.com/',
'http://news.ycombinator.com/'
]
def load_url(url):
    return urllib2.urlopen(url).read()

if __name__ == '__main__':
    with futures.ThreadPoolExecutor(max_workers=7) as executor:
        fmap = dict((executor.submit(load_url, url), url) for url in URLs)
        for f in futures.as_completed(fmap):
            url = fmap[f]
            try:
                content = f.result()
            except Exception as exception:
                print '%r failed: %s' % (url, exception)
            else:
                print '%r is %d bytes' % (url, len(content))
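If you would rather stay on the stdlib without the backport, the multiprocessing.dummy.Pool route mentioned above would look roughly like this (a sketch, Python 2; the URL list is abbreviated here):

import urllib2
from multiprocessing.dummy import Pool  # thread-based Pool, same API as multiprocessing.Pool

URLs = ['http://www.cnn.com/', 'http://www.bbc.co.uk/']  # same list as above, shortened here

def load_url(url):
    try:
        return url, urllib2.urlopen(url).read(), None
    except Exception as e:
        return url, None, e

if __name__ == '__main__':
    pool = Pool(7)
    for url, content, error in pool.map(load_url, URLs):
        if error is not None:
            print '%r failed: %s' % (url, error)
        else:
            print '%r is %d bytes' % (url, len(content))
    pool.close()
    pool.join()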
urllib.urlretrieve(url).read() should be urllib.urlopen(url).read()
from threading import *
from time import sleep
# if Python2:
import urllib
# if Python3:
# import urllib.request
URLs = [
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://www.economist.com/',
'http://nonexistant.website.at.baddomain/',
'http://slashdot.org/',
'http://reddit.com/',
'http://news.ycombinator.com/'
]
class worker(Thread):
    def __init__(self, link):
        Thread.__init__(self)
        self.link = link
        self.start()

    def run(self):
        # if Python2:
        res = urllib.urlopen(self.link).read()  # urlopen, not urlretrieve, as mentioned by @DhruvPathak
        # if Python3:
        # res = urllib.request.urlopen(self.link).read()
        with open(self.link.replace('/', '_'), 'wb') as fh:
            fh.write(res)  # store fetched data in a file named after the link

for url in URLs:
    while len(enumerate()) > 500:   # threading.enumerate(): list of live threads
        sleep(0.25)
    worker(url)

while len(enumerate()) > 1:
    sleep(0.25)  # wait for all threads to finish
What about using multiprocessing?
Sample code:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import urllib
from multiprocessing import Pool
import os
POOL = 8
PDFS_DOWNLOAD_DIR = 'pdfs'
PDF_LINKS = sys.argv[1]
class DownloadFiles(object):
    def __init__(self):
        self.pdf_links = self.read_links_from_file()
        self.create_download_dir()

    def create_download_dir(self):
        try:
            if not os.path.exists(PDFS_DOWNLOAD_DIR):
                os.makedirs(PDFS_DOWNLOAD_DIR)
        except IOError as e:
            exit()

    def read_links_from_file(self):
        try:
            with open(PDF_LINKS, 'r') as f:
                return list(set([x.strip() for x in f]))
        except (IndexError, IOError) as e:
            exit()

    def get_file(self, link):
        filename = link.split('/')[-2]
        print('Downloading file --> "{filename}"'.format(
            filename=filename
        ))
        urllib.urlretrieve(link, filename='{pdfs_data}/{filename}'.format(
            pdfs_data=PDFS_DOWNLOAD_DIR,
            filename=filename
        ))

    def download(self):
        pool = Pool(POOL)
        pool.map(self.get_file, self.pdf_links)
        pool.close()
        pool.join()
        print('\nSuccessfully downloaded files from given source!\n')

d = DownloadFiles()
d.download()
I'm writing a program to get the domains on the same server, and it can also scan their web directories.
#!/usr/bin/env python
#encoding = utf-8
import threading
import urllib,urllib2,httplib
from urllib2 import Request, urlopen, URLError
import Queue,sys
import re
concurrent = 5
url = sys.argv[1]
class Scanner(threading.Thread):
    def __init__(self, work_q):
        threading.Thread.__init__(self)
        self.work_q = work_q

    def getdomains(self):
        doreq = Request('http://www.logontube.com/website/' + url)
        response = urlopen(doreq)
        html = response.read()
        response.close()
        domains = re.findall('<br><a href=\"(.*?)\" target=\"_blank\"', html)
        return domains

    def run(self):
        alldomains = self.getdomains()
        pathline = [line.rstrip() for line in open("path.txt")]
        while True:
            for aim in alldomains:
                for path in pathline:
                    path = self.work_q.get()
                    req = Request(aim + path)
                    try:
                        response = urlopen(req)
                    except URLError, e:
                        if hasattr(e, 'reason'):
                            print aim + path, 'Not Found'
                        elif hasattr(e, 'code'):
                            print aim + path, 'Not Found'
                    else:
                        try:
                            logs = open('log.txt', "a+")
                        except(IOError):
                            print "[x] Failed to create log file"
                        print aim + path, "Found"
                        logs.writelines(aim + path + "\n")
                        logs.close()

def main():
    work_q = Queue.Queue()
    paths = [line.rstrip() for line in open("path.txt")]
    for i in range(concurrent):
        t = Scanner(work_q)
        t.setDaemon(True)
        t.start()
    for path in paths:
        work_q.put(path)
    work_q.join()

main()
The problem is that this program only loops over the paths, so I can only get the scan result for one website.
I've found the problem:

for path in paths:
    work_q.put(path)  # the program finishes once it has put all the paths

If you want to help me test this program, you may need a list of website directories (save it as path.txt):
/default.asp
/index.asp
/index.htm
/index.html
/index.jsp
/index.php
/admin.asp
/admin.php
/admin.shtml
/admin.txt
/admin_admin.asp
/config.asp
/inc/
/login.asp
/login.jsp
/login.php
/login/
/phpinfo.php
/readme.txt
/robots.txt
/test.asp
/test.html
/test.txt
/test.php
/news/readme.txt
/addmember/
You need a:

while 1:
    pass

or something else that waits until your threads have completed before the program exits.
What is happening is that you are starting the threads, but the main thread then terminates, so you never get to see the results of your worker threads.
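A sketch of that idea with a simplified worker (not the full Scanner class, and assuming each worker should stop once the queue is drained): the main thread join()s the workers instead of busy-waiting, so it cannot exit before they finish.

import threading
import Queue

def worker(q):
    while True:
        try:
            path = q.get_nowait()
        except Queue.Empty:
            return                      # queue drained: let this thread finish
        print "scanning", path          # real code would request aim+path here
        q.task_done()

q = Queue.Queue()
for path in ["/robots.txt", "/index.php", "/admin.php"]:
    q.put(path)

threads = [threading.Thread(target=worker, args=(q,)) for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()                            # main thread waits here instead of exiting straight away
print "all scanner threads finished"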
I got this error when running test.py:
C:\Python32>python.exe test.py
Traceback (most recent call last):
File "test.py", line 5, in <module>
import httplib
ImportError: No module named httplib
How do I correct it?
Code block for test.py:
#!/usr/local/bin/python
import httplib
import sys
import re
from HTMLParser import HTMLParser
class miniHTMLParser( HTMLParser ):
    viewedQueue = []
    instQueue = []

    def get_next_link( self ):
        if self.instQueue == []:
            return ''
        else:
            return self.instQueue.pop(0)

    def gethtmlfile( self, site, page ):
        try:
            httpconn = httplib.HTTPConnection(site)
            httpconn.request("GET", page)
            resp = httpconn.getresponse()
            resppage = resp.read()
        except:
            resppage = ""
        return resppage

    def handle_starttag( self, tag, attrs ):
        if tag == 'a':
            newstr = str(attrs[0][1])
            if re.search('http', newstr) == None:
                if re.search('mailto', newstr) == None:
                    if re.search('htm', newstr) != None:
                        if (newstr in self.viewedQueue) == False:
                            print (" adding", newstr)
                            self.instQueue.append( newstr )
                            self.viewedQueue.append( newstr )
                        else:
                            print (" ignoring", newstr)
                    else:
                        print (" ignoring", newstr)
            else:
                print (" ignoring", newstr)

def main():
    if sys.argv[1] == '':
        print ("usage is ./minispider.py site link")
        sys.exit(2)

    mySpider = miniHTMLParser()
    link = sys.argv[2]

    while link != '':
        print ("\nChecking link ", link)

        # Get the file from the site and link
        retfile = mySpider.gethtmlfile( sys.argv[1], link )

        # Feed the file into the HTML parser
        mySpider.feed(retfile)

        # Search the retfile here

        # Get the next link in level traversal order
        link = mySpider.get_next_link()

    mySpider.close()
    print ("\ndone\n")

if __name__ == "__main__":
    main()
You are running Python 2 code on Python 3. In Python 3, the module has been renamed to http.client.
You could try to run the 2to3 tool on your code, and try to have it translated automatically. References to httplib will automatically be rewritten to use http.client instead.
You can just import http.client and rename it to httplib with this code:
import http.client as httplib
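If the same script has to run on both Python 2 and Python 3, a guarded import is a common pattern (a sketch; note that test.py's from HTMLParser import HTMLParser has the same problem and becomes from html.parser import HTMLParser on Python 3):

try:
    import http.client as httplib   # Python 3
except ImportError:
    import httplib                   # Python 2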
If you use PyCharm, change your 'Project Interpreter' to 2.7.x.
I had this issue when I was trying to make my Docker container smaller. It was because I'd installed Python 2.7 with:
apt-get install -y --no-install-recommends python
And I should not have included the --no-install-recommends flag:
apt-get install -y python