Parsing HTML from Google with Python

I am having problems parsing the HTML source of a Google search with Python:
def events():
    location = sys.argv[1]
    url = "https://www.google.com/search?client=firefox-b-d&q=evenementen+" + location
    event = requests.get(url=url)
    print("De zoekterm is leeg, probeer het opnieuw")  # "The search term is empty, try again"
    soup_events = BeautifulSoup(event.text, 'html.parser')
    events_google = soup_events.find_all('<a class="rl_item rl_item_base" tabindex="0" href="/search?client=firefox-b-d')
    print(type(events_google))
    print(len(events_google))
    print(events_google)
I tried:
events_google = soup_events.find_all('div', 'BNeawe tAd8D AP7Wnd')
which worked, but when I try any other value it returns 0 results.
All the values I want to print do show up when I print event.text, so I'm not sure what I'm doing wrong.
When I run the code above I get the following response:
<class 'bs4.element.ResultSet'>
0
[]
I'm actually looking to find the events shown in a Google search.

Ahh, I think your issue is that
events_google = soup_events.find_all('<a class="rl_item rl_item_base" tabindex="0" href="/search?client=firefox-b-d')
should be
events_google = soup_events.find_all("a", {"class": ["rl_item", "rl_item_base"]})

Related

How to fix this web scraping program?

I'm a complete beginner with Python, and I wrote this program to scrape the closing odds of NHL games off The Score website and put that data in a file. The program runs, but for some reason 2 of the roughly 200 games I tried come out with incorrect data.
I think it is because of how I search through divs within a div: I wrote the code that returns that data in a way that only keeps the last div (which conveniently is the div I'm looking to scrape).
Also, I'm sure my way of writing to the file is poor for runtime; is there a better way to do this?
import requests
from bs4 import BeautifulSoup

# Function to scrape web and find the game title and closing odds
def get_match_data(url_val):
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(response.content, "html.parser")
    # Scrape for header which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})
    # Code to find div and search for div within
    divs = soup.find('div', {'class': 'col-sm-4'})
    for tag in divs:
        # find div
        target = tag.find_all("div", {"class", "GameDetailsCard__row--3rKYp"})
        for tag in target:
            # find divs within target div
            odds = tag.find("div", {"class", "GameDetailsCard__content--2L_KF"})
    # Call write_to_file -> add data scraped from web
    write_to_file(matchtitle.text + " " + odds.text)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    file = open("NHL_GAMES.txt", "a")
    file.write(game_data + "\n")
    file.close

### Main(void) ?? idk what to call this portion of code not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
Here is one line in the text file with correct data:
Toronto Maple Leafs # New Jersey Devils on November 24, 2022 NJD -140, o/u 6.5
Here is one with incorrect data:
Carolina Hurricanes # Anaheim Ducks on December 7, 2022 Justin St. Pierre, Chris Lee
Only 2/215 were wrong like this.
It looks like certain NHL game pages (the Carolina one, for example) do not contain a <div> section for the odds; this might be because they were OT games. Regardless, your best bet is to add a clause to handle "no odds found". I have updated some of your code below:
import requests
from bs4 import BeautifulSoup

# Function to scrape web and find the game title and closing odds
def get_match_data(url_val):
    results = []
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    # Scrape for header which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})
    target = soup.find_all("div", {"class", "GameDetailsCard__row--3rKYp"})
    for tag in target:
        if "Odds" in str(tag.find("div", {"class": "GameDetailsCard__label--iBMhJ"})):
            odds = str(tag.find("div", {"class": "GameDetailsCard__content--2L_KF"}).text)
        else:
            odds = "No Odds found!"
    print(matchtitle.text + " " + odds)
    results.append(matchtitle.text + " " + odds)
    # Call write_to_file -> add data scraped from web
    write_to_file(results)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    print("Getting game details...")
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    with open("NHL_GAMES.txt", "a") as file:
        for line in game_data:
            file.write(line + "\n")

### Main(void) ?? idk what to call this portion of code not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
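If you would rather not rely on the odds row being the last one on the page, a small variation on the loop above (same class names assumed) is to set a default first and only overwrite it when the "Odds" label actually turns up:
odds = "No Odds found!"
for row in soup.find_all("div", {"class": "GameDetailsCard__row--3rKYp"}):
    label = row.find("div", {"class": "GameDetailsCard__label--iBMhJ"})
    content = row.find("div", {"class": "GameDetailsCard__content--2L_KF"})
    if label and content and "Odds" in label.text:
        odds = content.text
        break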

Web scraping - .append adds whitespace and \n to list

I have written some code that helps me scrape websites. It has worked well on some sites, but I am currently running into an issue.
The collectData() function collects data from a site and appends it to 'dataList'. From this dataList I can create a CSV file to export the data.
The issue I am having right now is that the function appends multiple whitespace and \n characters to my list. The output looks like this (the excessive whitespace is not shown here):
dataList = ['\n 2.500.000 ']
Does anyone know what could cause this? As I mentioned, there are some websites where the function works fine.
Thank you!
from urllib.request import urlopen
from bs4 import BeautifulSoup

def scrape():
    dataList = []
    pageNr = range(0, 1)
    for page in pageNr:
        pageUrl = ('https://www.example.com/site:{}'.format(page))
        print(pageUrl)

        def getUrl(pageUrl):
            linkList = []
            openUrl = urlopen(pageUrl)
            soup = BeautifulSoup(openUrl, 'lxml')
            links = soup.find_all('a', class_="ellipsis")
            for link in links:
                linkNew = link.get('href')
                linkList.append(linkNew)
                #print(linkList)
            return linkList

        anzList = getUrl(pageUrl)
        length = len(anzList)
        print(length)

        anzLinks = []
        for i in range(length):
            anzLinks.append('https://www.example.com/' + anzList[i])
        print(anzLinks)

        def collectData():
            for link in anzLinks:
                openAnz = urlopen(link)
                soup = BeautifulSoup(openAnz, 'lxml')
                try:
                    kaufpreisSuche = soup.find('h2')
                    kaufpreis = kaufpreisSuche.text
                    dataList.append(kaufpreis)
                    print(kaufpreis)
                except:
                    kaufpreis = None
                    dataList.append(kaufpreis)
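For what it's worth, the stray whitespace and \n normally come straight from the page markup: the text node inside the h2 includes the indentation around the value, so sites that pretty-print their HTML produce it while others look clean. A minimal fix, sketched against the collectData() part above, is to strip the text before appending, for example with get_text(strip=True):
kaufpreisSuche = soup.find('h2')
if kaufpreisSuche is not None:
    # get_text(strip=True) drops the surrounding whitespace and newlines
    dataList.append(kaufpreisSuche.get_text(strip=True))
else:
    dataList.append(None)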

Python Link Scraper

focus_Search = raw_input("Focus Search ")
url = "https://www.google.com/search?q="
res = requests.get(url + focus_Search)
print("You Just Searched")
res_String = res.text
#Now I must get ALL the sections of code that start with "<a href" and end with "/a>"
I'm trying to scrape all the links from a Google search results page. I could extract each link one at a time, but I'm sure there's a better way to do it.
This creates a list of all the links on the search page with some of your code, without getting into BeautifulSoup:
import requests
import lxml.html
focus_Search = input("Focus Search ")
url = "https://www.google.com/search?q="
#focus_Search
res = requests.get(url + focus_Search).content
# res
dom = lxml.html.fromstring(res)
links = [x for x in dom.xpath('//a/@href')] # Borrows from cheekybastard in link below
# http://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
links
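For completeness, roughly the same thing with BeautifulSoup (just a sketch; Google wraps many result links in /url?q=..., so you may still want to filter or unquote those):
import requests
from bs4 import BeautifulSoup

focus_Search = input("Focus Search ")
res = requests.get("https://www.google.com/search?q=" + focus_Search)
soup = BeautifulSoup(res.text, "html.parser")
# some anchors have no href attribute, so filter those out
links = [a.get("href") for a in soup.find_all("a") if a.get("href")]
print(links)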

Recursive function gives no output

I'm scraping all the URLs of my domain with a recursive function, but it outputs nothing, without any error.
#!/usr/bin/python
from bs4 import BeautifulSoup
import requests
import tldextract

def scrape(url):
    for links in url:
        main_domain = tldextract.extract(links)
        r = requests.get(links)
        data = r.text
        soup = BeautifulSoup(data)
        for href in soup.find_all('a'):
            href = href.get('href')
            if not href:
                continue
            link_domain = tldextract.extract(href)
            if link_domain.domain == main_domain.domain:
                problem.append(href)
            elif not href == '#' and link_domain.tld == '':
                new = 'http://www.' + main_domain.domain + '.' + main_domain.tld + '/' + href
                problem.append(new)
        return len(problem)
        return scrape(problem)

problem = ["http://xyzdomain.com"]
print(scrape(problem))
When I create a new list, it works, but I don't want to make a list every time for every loop.
You need to structure your code so that it meets the pattern for recursion, which your current code doesn't. You also should not give a variable the same name as the object it came from, e.g. href = href.get('href'), because the original gets shadowed and this will usually stop things working. As it currently stands, your code will only ever return the len(), because that return is unconditionally reached before return scrape(problem):
def Recursive(Factorable_problem):
    if Factorable_problem is Simplest_Case:
        return AnswerToSimplestCase
    else:
        return Rule_For_Generating_From_Simpler_Case(Recursive(Simpler_Case))
for example:
def Factorial(n):
    """ Recursively Generate Factorials """
    if n < 2:
        return 1
    else:
        return n * Factorial(n - 1)
Hello, I've made a non-recursive version of this that appears to get all the links on the same domain.
I've tested the code below using the problem included in the code. Once I'd solved the problems with the recursive version, the next problem was hitting the recursion depth limit, so I rewrote it to run in an iterative fashion; the code and result are below:
from bs4 import BeautifulSoup
import requests
import tldextract

def print_domain_info(d):
    print "Main Domain:{0} \nSub Domain:{1} \nSuffix:{2}".format(d.domain, d.subdomain, d.suffix)

SEARCHED_URLS = []
problem = ["http://Noelkd.neocities.org/", "http://youpi.neocities.org/"]

while problem:
    # Get a link from the stack of links
    link = problem.pop()
    # Check we haven't been to this address before
    if link in SEARCHED_URLS:
        continue
    # We don't want to come back here again after this point
    SEARCHED_URLS.append(link)
    # Try and get the website
    try:
        req = requests.get(link)
    except:
        # If its not working i don't care for it
        print "borked website found: {0}".format(link)
        continue
    # Now we get to this point worth printing something
    print "Trying to parse:{0}".format(link)
    print "Status Code:{0} Thats: {1}".format(req.status_code, "A-OK" if req.status_code == 200 else "SOMETHING'S UP")
    # Get the domain info
    dInfo = tldextract.extract(link)
    print_domain_info(dInfo)
    # I like utf-8
    data = req.text.encode("utf-8")
    print "Length Of Data Retrieved:{0}".format(len(data))  # More info
    soup = BeautifulSoup(data)  # This was here before so i left it.
    print "Found {0} link{1}".format(len(soup.find_all('a')), "s" if len(soup.find_all('a')) > 1 else "")
    FOUND_THIS_ITERATION = []  # Getting the same links over and over was boring
    found_links = [x for x in soup.find_all('a') if x.get('href') not in SEARCHED_URLS]  # Find me all the links i don't got
    for href in found_links:
        href = href.get('href')  # You wrote this seems to work well
        if not href:
            continue
        link_domain = tldextract.extract(href)
        if link_domain.domain == dInfo.domain:  # JUST FINDING STUFF ON SAME DOMAIN RIGHT?!
            if href not in FOUND_THIS_ITERATION:  # I'ma check you out next time
                print "Check out this link: {0}".format(href)
                print_domain_info(link_domain)
                FOUND_THIS_ITERATION.append(href)
                problem.append(href)
            else:  # I got you already
                print "DUPE LINK!"
        else:
            print "Not on same domain moving on"
    # Count down
    print "We have {0} more sites to search".format(len(problem))
    if problem:
        continue
    else:
        print "It's been fun"
        print "Let's see the URLs we've visited:"
        for url in SEARCHED_URLS:
            print url
Which prints, after a lot of other logging, loads of neocities websites!
What's happening is that the script pops a value off the list of websites yet to visit, then gets all the links on that page which are on the same domain. If those links point to pages we haven't visited, we add them to the list of links to be visited. After that we pop the next page and do the same thing again until there are no pages left to visit.
I think this is what you're looking for. Get back to us in the comments if it doesn't work the way you want, and if anyone can improve it, please leave a comment.

I need help web-scraping

So I wanted to scrape visualizations from visual.ly; however, right now I do not understand how the "show more" button works. As of now, my code will get the image link, the text next to the image, and the link of the page. I was wondering how the "show more" button functions, because I was going to try to loop through using the number of pages. As of now I do not know how I would loop through each one individually. Any ideas on how I could loop through and get more images than they originally show you?
from BeautifulSoup import BeautifulSoup
import urllib2
import HTMLParser
import urllib, re

counter = 1
columnno = 1
parser = HTMLParser.HTMLParser()

soup = BeautifulSoup(urllib2.urlopen('http://visual.ly/?view=explore&type=static#v2_filter').read())
image = soup.findAll("div", attrs={'class': 'view-mode-wrapper'})
if columnno < 4:
    column = image[0].findAll("div", attrs={'class': 'v2_grid_column'})
    columnno += 1
else:
    column = image[0].findAll("div", attrs={'class': 'v2_grid_column last'})
visualizations = column[0].findAll("div", attrs={'class': '0 v2_grid_item viewmode-item'})
getImage = visualizations[0].find("a")
print counter
print getImage['href']
soup1 = BeautifulSoup(urllib2.urlopen(getImage['href']).read())
theImage = soup1.findAll("div", attrs={'class': 'ig-graphic-wrapper'})
text = soup1.findAll("div", attrs={'class': 'ig-content-right'})
getText = text[0].findAll("div", attrs={'class': 'ig-description right-section first'})
imageLink = theImage[0].find("a")
print imageLink['href']
print getText
for row in image:
    theImage = image[0].find("a")
    actually_download = False
    if actually_download:
        filename = link.split('/')[-1]
        urllib.urlretrieve(link, filename)
    counter += 1
You cannot use a urllib-parser combo here because the site uses JavaScript to load more content. In order to do this you will need a full browser emulator (with JavaScript support). I have never used Selenium before, but I have heard that it does this, and that it has a Python binding.
However, I have found that it uses a very predictable form
http://visual.ly/?page=<page_number>
for its GET requests. Perhaps an easier way would be to go under
<div class="view-mode-wrapper">...</div>
to parse the data (using the above URL format). After all, the AJAX requests must go to some location.
Then you could do
for i in xrange(<whatever>):
    url = r'http://visual.ly/?page={pagenum}'.format(pagenum=i)
    # do whatever you want from here
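Putting those two ideas together, a rough sketch (assuming the ?page=<n> parameter and the view-mode-wrapper class still behave as described above):
import urllib2
from BeautifulSoup import BeautifulSoup

for i in xrange(1, 6):  # however many pages you want to walk
    url = 'http://visual.ly/?page={pagenum}'.format(pagenum=i)
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    wrappers = soup.findAll("div", attrs={'class': 'view-mode-wrapper'})
    if not wrappers:
        break  # nothing came back for this page number
    for a in wrappers[0].findAll("a"):
        href = a.get("href")
        if href:
            print href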
