I'm building a Twitter bot using Tweepy and BeautifulSoup4. I'd like to save the results of a request in a list, but my script isn't working anymore (it was still working a few days ago). I've been looking at it and I can't figure out why. Here is my function:
import requests
import tweepy
from bs4 import BeautifulSoup
import urllib
import os
from tweepy import StreamListener
from TwitterEngine import TwitterEngine
from ConfigEngine import TwitterAPIConfig
import urllib.request
import emoji
import random
# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
# Retrieve the links
def parseLinks(url):
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_='r'):
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0]['href']
                results.append(link)
        return results
The "url" parameter is 100% correct in the rest of the code. As an output, I get a "None". To be more precise, the execution stops right after line "results = []" (so it doesn't enter into the for).
Any idea?
Thank you so much in advance!
It seems that Google changed the HTML markup on the page. Try changing the search from class="r" to class="rc":
import requests
from bs4 import BeautifulSoup
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
def parseLinks(url):
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_='rc'):  # <-- change 'r' to 'rc'
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0]['href']
                results.append(link)
        return results
url = 'https://www.google.com/search?q=tree'
print(parseLinks(url))
Prints:
['https://en.wikipedia.org/wiki/Tree', 'https://simple.wikipedia.org/wiki/Tree', 'https://www.britannica.com/plant/tree', 'https://www.treepeople.org/tree-benefits', 'https://books.google.sk/books?id=yNGrqIaaYvgC&pg=PA20&lpg=PA20&dq=tree&source=bl&ots=_TP8PqSDlT&sig=ACfU3U16j9xRJgr31RraX0HlQZ0ryv9rcA&hl=sk&sa=X&ved=2ahUKEwjOq8fXyKjsAhXhAWMBHToMDw4Q6AEwG3oECAcQAg', 'https://teamtrees.org/', 'https://www.woodlandtrust.org.uk/trees-woods-and-wildlife/british-trees/a-z-of-british-trees/', 'https://artsandculture.google.com/entity/tree/m07j7r?categoryId=other']
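Since Google changes these class names regularly, it can help to anchor the selector on something more stable, such as the result heading, rather than on 'r' or 'rc'. Below is a minimal sketch of that idea; the a:has(h3) selector is an assumption about Google's markup at the time of writing, not something taken from the answer above:

import requests
from bs4 import BeautifulSoup

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"

def parse_links(url):
    resp = requests.get(url, headers={"user-agent": USER_AGENT})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    # Select anchors that contain an <h3> result title instead of relying
    # on Google's frequently changing class names ('r', 'rc', ...).
    return [a["href"] for a in soup.select("a:has(h3)") if a.get("href")]

print(parse_links('https://www.google.com/search?q=tree'))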
Related
I am making a program that fetches apps from APKMirror.
I fetch the page using urllib and scrape it with Beautiful Soup:
import re
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# Find the latest "release" link on the developer page and extract the version string.
for a in BeautifulSoup(urlopen(Request(url="https://www.apkmirror.com/apk/twitter-inc/", headers={'User-Agent': 'Mozilla/5.0'})).read(), 'lxml').find_all("a", class_="fontBlack", text=re.compile("^.*.release*")):
    twver = a.string.split(' ')[1].replace(".", "-")
    break
# Build the release page URL, then follow it to the APK page, the download page, and finally the direct download link.
twurl = "".join(["https://www.apkmirror.com/apk/twitter-inc/twitter/twitter-", twver, "-release/"])
twpage1 = "".join(["https://apkmirror.com", BeautifulSoup(urlopen(Request(url=twurl, headers={'User-Agent': 'Mozilla/5.0'})).read(), 'lxml').find("span", text="APK").parent.find("a", class_="accent_color")['href']])
twpage2 = "".join(["https://apkmirror.com", BeautifulSoup(urlopen(Request(url=twpage1, headers={'User-Agent': 'Mozilla/5.0'})).read(), 'lxml').find("a", {'class': re.compile("accent_bg btn btn-flat downloadButton")})['href']])
twdllink = "".join(["https://apkmirror.com", BeautifulSoup(urlopen(Request(url=twpage2, headers={'User-Agent': 'Mozilla/5.0'})).read(), 'lxml').find(rel="nofollow")['href']])
So, could you please tell me how to use a single connection to the APKMirror server and reuse it to fetch a different URL each time?
As you can see, the URL changes every time.
Or suggest other ways to make it faster.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}

def get_soup(content):
    return BeautifulSoup(content, 'lxml')

def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        r = req.get(url)
        soup = get_soup(r.content)
        links = (urljoin(url, x['href'])
                 for x in soup.select('a.fontBlack[href*=release]'))
        for link in links:
            # From here you can continue:
            # r = req.get(link), as you are using the same session currently
            print(link)

if __name__ == "__main__":
    main('https://www.apkmirror.com/apk/twitter-inc/')
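As a follow-up, here is a minimal sketch of how the rest of the original chain (release page, APK page, download page, final link) could be walked over that same Session, so every request reuses the same connection pool. It follows the "release" anchor directly instead of rebuilding the URL from the version string, and the selectors are lifted from the question's code, so they are assumptions about APKMirror's current markup rather than tested values:

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

headers = {'User-Agent': 'Mozilla/5.0'}

def fetch(session, url):
    # One helper so every page goes through the same keep-alive session.
    r = session.get(url)
    r.raise_for_status()
    return BeautifulSoup(r.content, 'lxml')

with requests.Session() as s:
    s.headers.update(headers)
    base = 'https://www.apkmirror.com'
    # Latest release anchor on the developer page
    release = fetch(s, base + '/apk/twitter-inc/').select_one('a.fontBlack[href*=release]')
    # Release page -> APK variant page -> download page -> direct link
    page1 = fetch(s, urljoin(base, release['href']))
    apk_link = page1.find('span', text='APK').parent.find('a', class_='accent_color')
    page2 = fetch(s, urljoin(base, apk_link['href']))
    dl_btn = page2.find('a', class_=re.compile('downloadButton'))
    page3 = fetch(s, urljoin(base, dl_btn['href']))
    final = page3.find('a', rel='nofollow')
    print(urljoin(base, final['href']))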
I am attempting to read and parse a website that returns JSON. Every attempt I have made either gives me a timeout error or no error at all (I have to stop it manually).
URL:
https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089
Code I have tried:
import json
import requests
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

# Trial 1
BASE_URL = 'https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'
}
response = requests.get(BASE_URL, headers=headers)

# Trial 2
url = 'https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089'
req = Request(url, headers=headers)
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
obj = json.loads(str(page_soup))

# Trial 3
import dload
j = dload.json('https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089')
print(j)
So far, none of these attempts, or any variation of them, has succeeded in opening the website and reading it. Any help would be appreciated.
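A small diagnostic sketch, not a confirmed fix: none of the trials pass a timeout, so the call can block indefinitely. Setting an explicit timeout makes the request fail fast, and checking the status code before parsing shows whether the server is refusing this client. The extra header values here are assumptions about what a browser would send, and whether the endpoint answers at all depends on the site's bot protection:

import requests

BASE_URL = 'https://api.louisvuitton.com/api/eng-us/catalog/availability/M57089'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.9',
}

try:
    # timeout makes the call fail fast instead of hanging forever
    resp = requests.get(BASE_URL, headers=headers, timeout=10)
    print(resp.status_code)
    if resp.ok:
        # the endpoint returns JSON, so no HTML parsing is needed
        print(resp.json())
except requests.exceptions.Timeout:
    print("Request timed out; the server is likely not answering this client")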
I don't know much about web scraping. I wrote this code to get the search results for a Google query, but it runs really slowly. I want to try to add multithreading, but I don't really know how. Can somebody tell me how to multithread, and which function I'm supposed to multithread?
import urllib
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
# desktop user-agent
def get_listing(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    html = None
    links = None
    r = requests.get(url, headers=headers, timeout=10)
    if r.status_code == 200:
        html = r.text
        soup = BeautifulSoup(html, 'lxml')
        listing_section = soup.select('#offers_table table > tbody > tr > td > h3 > a')
        links = [link['href'].strip() for link in listing_section]
    return links

def scrapeLinks(query_string):
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    query = query_string
    query = query.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(URL, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_='r'):
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0]['href']
                title = g.find('h3').text
                item = {
                    "title": title,
                    "link": link
                }
                results.append(item)
        return results

def getFirst5Results(query_string):
    results = scrapeLinks(query_string)
    return [results[0]["link"], results[1]["link"], results[2]["link"], results[3]["link"], results[4]["link"]]
A few things about multithreading:
You can use it for code that makes network calls, for instance invoking an API, or when the code runs for a long time and you want the work to happen in the background.
In the case you've described, the web scraping is a long-running task: it involves a network call to Google and parsing of the results once they come back. Assuming that you're using the scrapeLinks function for scraping, here's some code:
import threading
t1 = threading.Thread(target=scrapeLinks, args=(query_string,))
t1.start()
To wait for the thread to finish, use:
t1.join()
Note that join() only blocks until the thread completes; it does not itself return the result of scrapeLinks.
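If the goal is to actually collect the return values, for example to run scrapeLinks for several queries at once, concurrent.futures is usually simpler than raw threads. A minimal sketch, assuming the scrapeLinks function from the question; the example queries are placeholders:

from concurrent.futures import ThreadPoolExecutor

queries = ["python web scraping", "beautifulsoup tutorial", "requests timeout"]  # placeholder queries

# Each scrapeLinks call spends most of its time waiting on the network,
# so running several of them in a small thread pool overlaps that waiting.
with ThreadPoolExecutor(max_workers=5) as pool:
    results_per_query = list(pool.map(scrapeLinks, queries))

for query, results in zip(queries, results_per_query):
    print(query, "->", len(results or []), "results")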
from bs4 import BeautifulSoup
import requests
web_url = r'https://www.mlb.com/scores/2019-05-12'
get_web = requests.get(web_url).text
soup = BeautifulSoup(get_web,"html.parser")
score = soup.find_all('div',class_='container')
print(score)
I want to find this.
But result is this
Send headers with your request to tell the server "hey, I'm a desktop browser", so it returns the same HTML it would serve to a real browser:
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
request = requests.get(url, headers={'User-Agent': user_agent})
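Putting that together with the original snippet, a minimal sketch (the User-Agent string is just the example value from above, any modern desktop browser string should behave similarly):

from bs4 import BeautifulSoup
import requests

web_url = r'https://www.mlb.com/scores/2019-05-12'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

# Identify as a desktop browser so the server returns the full page markup
get_web = requests.get(web_url, headers={'User-Agent': user_agent}).text
soup = BeautifulSoup(get_web, "html.parser")
score = soup.find_all('div', class_='container')
print(score)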
Useful links:
How to use Python requests to fake a browser visit?
Sending "User-agent" using Requests library in Python
I'm finally learning how to use class and __init__, but I'm having an issue with session. It seems like the session is not carrying over to the next request. I made a simple script for testing: it adds an item, then makes another request to see if the bag contains any value (e.g. Bag(1)). The problem is that the item is added, but I get Bag(0) on the second request. All I can think of is that there might be an issue with the session on my part, but I can't figure it out. Here's the script:
import requests, re
from bs4 import BeautifulSoup
class Test():
    def __init__(self):
        self.s = requests.Session()
        self.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'

    def cart(self):
        headers = {'User-Agent': self.userAgent}
        r = self.s.get('http://undefeated.com/store/index.php?api=1&rowid=130007&qty=1', headers=headers)
        print(r.text)
        if re.findall('Added', r.text):
            r = self.s.get('http://undefeated.com/store/cart/pg', headers=headers).text
            soup = BeautifulSoup(r, 'lxml')
            bag = soup.find('li', {'class': 'leaf cart'}).text
            print(bag)

start = Test().cart()
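One way to check whether the session really is the problem is to print the cookie jar after each request: if the first response never sets a cart cookie (for example because the site sets it via JavaScript or on a redirect), the second request cannot possibly see the item. A small diagnostic sketch; the exact cookie names are unknown, and whether cookies are the actual cause here is an assumption:

import requests

s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0'}

r1 = s.get('http://undefeated.com/store/index.php?api=1&rowid=130007&qty=1', headers=headers)
print("after add-to-cart:", s.cookies.get_dict())   # which cookies did the server set?

r2 = s.get('http://undefeated.com/store/cart/pg', headers=headers)
print("after cart page:  ", s.cookies.get_dict())   # are the same cookies being sent back?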