I can easily get the data when I put html = urllib.request.urlopen(req) inside a while loop, but it takes about 3 seconds to get the data. So I thought that if I put that call outside the loop, I could get it faster, since it wouldn't have to open the URL every time, but this throws an AttributeError: 'str' object has no attribute 'read'. Maybe it doesn't recognize the html variable name. How can I speed up the processing?
def soup():
    url = "http://www.investing.com/indices/major-indices"
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
            'Connection': 'keep-alive'}
    )
    global Ltp
    global html
    html = urllib.request.urlopen(req)
    while True:
        html = html.read().decode('utf-8')
        bsobj = BeautifulSoup(html, "lxml")
        Ltp = bsobj.find("td", {"class": "pid-169-last"})
        Ltp = Ltp.text
        Ltp = Ltp.replace(',', '')
        os.system('cls')
        Ltp = float(Ltp)
        print(Ltp, datetime.datetime.now())

soup()
If you want to fetch live data, you need to re-request the URL periodically:
html = urllib.request.urlopen(req)
This call should be inside the loop.
import os
import urllib.request
import datetime
from bs4 import BeautifulSoup
import time

def soup():
    url = "http://www.investing.com/indices/major-indices"
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
            'Connection': 'keep-alive'}
    )
    global Ltp
    global html
    while True:
        # re-open the URL on every iteration so each pass fetches fresh data
        html = urllib.request.urlopen(req)
        ok = html.read().decode('utf-8')
        bsobj = BeautifulSoup(ok, "lxml")
        Ltp = bsobj.find("td", {"class": "pid-169-last"})
        Ltp = Ltp.text
        Ltp = Ltp.replace(',', '')
        os.system('cls')
        Ltp = float(Ltp)
        print(Ltp, datetime.datetime.now())
        time.sleep(3)

soup()
Result:
sh: cls: command not found
18351.61 2016-08-31 23:44:28.103531
sh: cls: command not found
18351.54 2016-08-31 23:44:36.257327
sh: cls: command not found
18351.61 2016-08-31 23:44:47.645328
sh: cls: command not found
18351.91 2016-08-31 23:44:55.618970
sh: cls: command not found
18352.67 2016-08-31 23:45:03.842745
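As an aside, the sh: cls: command not found lines in that output appear because os.system('cls') is Windows-only; on macOS/Linux the equivalent command is clear. An optional cross-platform tweak, not part of the original code:
import os

# 'cls' clears the terminal on Windows, 'clear' on macOS/Linux
os.system('cls' if os.name == 'nt' else 'clear')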
You reassign html to the decoded UTF-8 string of the response and then keep treating it as if it were the response object. This code does not fetch new data from the server on every loop: read() simply reads the bytes from the already-open response object, it doesn't make a new request.
You can speed up the processing with the Requests library, which gives you persistent connections (or use urllib3 directly).
Try this (you will need to pip install requests)
import os
import datetime
from requests import Session
from bs4 import BeautifulSoup

s = Session()

while True:
    resp = s.get("http://www.investing.com/indices/major-indices")
    bsobj = BeautifulSoup(resp.text, "html.parser")
    Ltp = bsobj.find("td", {"class": "pid-169-last"})
    Ltp = Ltp.text
    Ltp = Ltp.replace(',', '')
    os.system('cls')
    Ltp = float(Ltp)
    print(Ltp, datetime.datetime.now())
Related
I am attempting to read and parse a website that returns JSON. Every attempt I have made either gives me a timeout error or simply hangs with no error at all (I have to stop it manually).
URL:
https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089
Code I have tried:
import json
import requests
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

# Trial 1
BASE_URL = 'https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'
}
response = requests.get(BASE_URL, headers=headers)

# Trial 2
url = 'https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089'
req = Request(url, headers=headers)
webpage = urlopen(req).read()
page_soup = BeautifulSoup(webpage, "html.parser")
obj = json.loads(str(page_soup))

# Trial 3
import dload
j = dload.json('https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089')
print(j)
So far none of these attempts, nor any variation of them, has succeeded in opening the website and reading it. Any help would be appreciated.
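One thing worth trying, as a minimal variant of Trial 1: pass an explicit timeout so a hang fails fast, and decode the JSON body directly with response.json(). The URL and User-Agent are taken from the question; whether the server answers at all may depend on its bot protection:
import requests

BASE_URL = 'https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'Accept': 'application/json',  # hint that we expect a JSON response
}

try:
    # timeout makes the request raise instead of hanging indefinitely
    response = requests.get(BASE_URL, headers=headers, timeout=10)
    response.raise_for_status()
    data = response.json()  # decode the JSON body directly
    print(data)
except requests.RequestException as exc:
    print('Request failed:', exc)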
I'm building a Twitter bot using Tweepy and BeautifulSoup4. I'd like to save the results of a request in a list, but my script isn't working anymore (it was working a few days ago). I've been looking at it and I don't understand why. Here is my function:
import requests
import tweepy
from bs4 import BeautifulSoup
import urllib
import os
from tweepy import StreamListener
from TwitterEngine import TwitterEngine
from ConfigEngine import TwitterAPIConfig
import urllib.request
import emoji
import random
# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
# Retrieve the links
def parseLinks(url):
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_='r'):
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0]['href']
                results.append(link)
        return results
The "url" parameter is 100% correct in the rest of the code. As an output, I get a "None". To be more precise, the execution stops right after line "results = []" (so it doesn't enter into the for).
Any idea?
Thank you so much in advance!
It seems that Google changed the HTML markup on the page. Try to change the search from class="r" to class="rc":
import requests
from bs4 import BeautifulSoup

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"

def parseLinks(url):
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_='rc'):  # <-- change 'r' to 'rc'
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0]['href']
                results.append(link)
        return results

url = 'https://www.google.com/search?q=tree'
print(parseLinks(url))
Prints:
['https://en.wikipedia.org/wiki/Tree', 'https://simple.wikipedia.org/wiki/Tree', 'https://www.britannica.com/plant/tree', 'https://www.treepeople.org/tree-benefits', 'https://books.google.sk/books?id=yNGrqIaaYvgC&pg=PA20&lpg=PA20&dq=tree&source=bl&ots=_TP8PqSDlT&sig=ACfU3U16j9xRJgr31RraX0HlQZ0ryv9rcA&hl=sk&sa=X&ved=2ahUKEwjOq8fXyKjsAhXhAWMBHToMDw4Q6AEwG3oECAcQAg', 'https://teamtrees.org/', 'https://www.woodlandtrust.org.uk/trees-woods-and-wildlife/british-trees/a-z-of-british-trees/', 'https://artsandculture.google.com/entity/tree/m07j7r?categoryId=other']
So I want to scrape details from https://bookdepository.com.
The problem is that it detects the country and changes the prices accordingly.
I want it to treat me as being in a different country.
This is my code (I run it on repl.it), and I need the Book Depository website to think I'm from Israel.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
bookdepo_url = 'https://www.bookdepository.com/search?search=Find+book&searchTerm=' + "0671646788".replace(' ', "+")
search_result = requests.get(bookdepo_url, headers = headers)
soup = BeautifulSoup(search_result.text, 'html.parser')
result_divs = soup.find_all("div", class_= "book-item")
You would need to either route your requests through a proxy server or a VPN, or execute your code on a machine based in Israel.
That being said, the following works (as of the time of this writing):
import pprint

from bs4 import BeautifulSoup
import requests

def make_proxy_entry(proxy_ip_port):
    val = f"http://{proxy_ip_port}"
    return dict(http=val, https=val)

headers = {
    "User-Agent": (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
}
bookdepo_url = (
    'https://www.bookdepository.com/search?search=Find+book&searchTerm='
    '0671646788'
)
ip_opts = ['82.166.105.66:44081', '82.81.32.165:3128', '82.81.169.142:80',
           '81.218.45.159:8080', '82.166.105.66:43926', '82.166.105.66:58774',
           '31.154.189.206:8080', '31.154.189.224:8080', '31.154.189.211:8080',
           '213.8.208.233:8080', '81.218.45.231:8888', '192.116.48.186:3128',
           '185.138.170.204:8080', '213.151.40.43:8080', '81.218.45.141:8080']

search_result = None
for ip_port in ip_opts:
    proxy_entry = make_proxy_entry(ip_port)
    try:
        search_result = requests.get(bookdepo_url, headers=headers,
                                     proxies=proxy_entry)
        pprint.pprint('Successfully gathered results')
        break
    except Exception as e:
        pprint.pprint(f'Failed to connect to endpoint, with proxy {ip_port}.\n'
                      f'Details: {pprint.saferepr(e)}')
else:
    pprint.pprint('Never made successful connection to end-point!')
    search_result = None

if search_result:
    soup = BeautifulSoup(search_result.text, 'html.parser')
    result_divs = soup.find_all("div", class_="book-item")
    pprint.pprint(result_divs)
This solution makes use of the requests library's proxies parameter. I scraped a list of proxies from one of the many free proxy-list sites: http://spys.one/free-proxy-list/IL/
The list of proxy IP addresses and ports was created using the following JavaScript snippet to scrape data off the page via my browser's Dev Tools:
console.log(
    "['" +
    Array.from(document.querySelectorAll('td>font.spy14'))
        .map(e => e.parentElement)
        .filter(e => e.offsetParent !== null)
        .filter(e => window.getComputedStyle(e).display !== 'none')
        .filter(e => e.innerText.match(/\s*(\d{1,3}\.){3}\d{1,3}\s*:\s*\d+\s*/))
        .map(e => e.innerText)
        .join("', '") +
    "']"
)
Note: Yes, that JavaScript is ugly and gross, but it got the job done.
At the end of the Python script's execution, I do see that the final currency resolves, as desired, to Israeli New Shekel (ILS), based on elements like the following in the resultant HTML:
<a ... data-currency="ILS" data-isbn="9780671646783" data-price="57.26" ...>
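If you want to verify the currency programmatically rather than by eyeballing the HTML, here's a small follow-on check (the data-* attribute names come from the element shown above, and result_divs is from the snippet earlier in this answer):
# Print the currency/price attributes served for each search result
for div in result_divs:
    anchor = div.find("a", attrs={"data-currency": True})
    if anchor:
        print(anchor.get("data-currency"), anchor.get("data-price"), anchor.get("data-isbn"))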
I don't know much about web scraping. I wrote this code, but it runs really slowly; it is used to get the search results from a Google query. I want to try to add multithreading, but I don't really know how. Can somebody tell me how to multithread? Also, which function am I supposed to multithread?
import urllib
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool

# desktop user-agent
def get_listing(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    html = None
    links = None
    r = requests.get(url, headers=headers, timeout=10)
    if r.status_code == 200:
        html = r.text
        soup = BeautifulSoup(html, 'lxml')
        listing_section = soup.select('#offers_table table > tbody > tr > td > h3 > a')
        links = [link['href'].strip() for link in listing_section]
    return links

def scrapeLinks(query_string):
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    query = query_string.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(URL, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_='r'):
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0]['href']
                title = g.find('h3').text
                item = {
                    "title": title,
                    "link": link
                }
                results.append(item)
        return results

def getFirst5Results(query_string):
    results = scrapeLinks(query_string)
    return [results[0]["link"], results[1]["link"], results[2]["link"],
            results[3]["link"], results[4]["link"]]
A few things about multithreading: you can use it for code that makes network calls, for instance invoking an API, and it helps when the code runs for a long time and you want the work to happen in the background. In the case you've described, web scraping is a long-running task, as it involves a network call to Google and parsing of the results once they come back. Assuming you're using the scrapeLinks function for the scraping, here's some code:
import threading

t1 = threading.Thread(target=scrapeLinks, args=(query_string,))
t1.start()
To wait for the thread to finish, use:
t1.join()
Note that join() only blocks until the thread completes; a Thread created with target= discards the function's return value, so you still have to collect results yourself (for example with a shared list or a thread pool, as sketched below).
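If you want to run several queries concurrently and collect their results, a thread pool is the more convenient pattern. A minimal sketch, assuming scrapeLinks is defined as in your question (the query list below is made up purely for illustration):
from concurrent.futures import ThreadPoolExecutor

# Hypothetical queries, just for illustration
queries = ["python web scraping", "beautifulsoup tutorial", "requests timeout"]

# executor.map runs scrapeLinks concurrently and yields results in order
with ThreadPoolExecutor(max_workers=5) as executor:
    all_results = list(executor.map(scrapeLinks, queries))

for query, results in zip(queries, all_results):
    print(query, results)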
I'm finally learning how to use classes and __init__; however, I'm having an issue with session. It seems like the session is not carrying over to the next request. I made a simple script for testing: it adds an item, then makes another request to see if the Bag contains any value (e.g. Bag(1)). The problem is that the item is added, but I get Bag(0) when I make the second request. All I can think of is that there might be an issue with the session on my part, but I can't figure it out. Here's the script:
import requests, re
from bs4 import BeautifulSoup

class Test():
    def __init__(self):
        self.s = requests.Session()
        self.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'

    def cart(self):
        headers = {'User-Agent': self.userAgent}
        r = self.s.get('http://undefeated.com/store/index.php?api=1&rowid=130007&qty=1', headers=headers)
        print(r.text)
        if re.findall('Added', r.text):
            r = self.s.get('http://undefeated.com/store/cart/pg', headers=headers).text
            soup = BeautifulSoup(r, 'lxml')
            bag = soup.find('li', {'class': 'leaf cart'}).text
            print(bag)

start = Test().cart()
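One quick way to check whether the Session is actually carrying state between the two requests is to inspect its cookie jar after each call. A small debugging sketch using the same URLs and User-Agent as the script above (the prints are illustrative only; they don't change what gets sent):
import requests

s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}

# First request: add the item, then see which cookies the server set
r1 = s.get('http://undefeated.com/store/index.php?api=1&rowid=130007&qty=1', headers=headers)
print('Cookies after add-to-cart:', s.cookies.get_dict())

# Second request: fetch the cart page with the same session (same cookies)
r2 = s.get('http://undefeated.com/store/cart/pg', headers=headers)
print('Cookies after cart page:', s.cookies.get_dict())
If the first print shows no cookies at all, the server isn't issuing a session cookie to this client, which would explain the empty bag.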