Python Web Scraper - Issue grabbing links from href - python

I've been following along this guide to web scraping LinkedIn and google searches. There have been some changes in the HTML of google's search results since the guide was created so I've had to tinker with the code a bit. I'm at the point where I need to grab the links from the search results but have run into an issue where the program doesn't return anything even after implementing a code fix from this post due to an error. I'm not sure what I'm doing wrong here.
import Parameters
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from parsel import Selector
import csv
# defining new variable passing two parameters
writer = csv.writer(open(Parameters.file_name, 'w'))
# writerow() method to the write to the file object
writer.writerow(['Name', 'Job Title', 'Company', 'College', 'Location', 'URL'])
# specifies the path to the chromedriver.exe
driver = webdriver.Chrome('/Users/.../Python Scripts/chromedriver')
driver.get('https://www.linkedin.com')
sleep(0.5)
# locate email form by_class_name then send_keys() to simulate key strokes
username = driver.find_element_by_id('session_key')
username.send_keys(Parameters.linkedin_username)
sleep(0.5)
password = driver.find_element_by_id('session_password')
password.send_keys(Parameters.linkedin_password)
sleep(0.5)
sign_in_button = driver.find_element_by_class_name('sign-in-form__submit-button')
sign_in_button.click()
sleep(3)
driver.get('https:www.google.com')
sleep(3)
search_query = driver.find_element_by_name('q')
search_query.send_keys(Parameters.search_query)
sleep(0.5)
search_query.send_keys(Keys.RETURN)
sleep(3)
################# HERE IS WHERE THE ISSUE LIES ######################
#linkedin_urls = driver.find_elements_by_class_name('iUh30')
linkedin_urls = driver.find_elements_by_css_selector("yuRUbf > a")
for url_prep in linkedin_urls:
url_prep.get_attribute('href')
#linkedin_urls = [url.text for url in linkedin_urls]
sleep(0.5)
print('Supposed to be URLs')
print(linkedin_urls)
The search parameter is
search_query = 'site:linkedin.com/in/ AND "python developer" AND "London"'
Results in an empty list:
Snippet of the HTML section I want to grab:
EDIT: This is the output if I go by .find_elements_by_class_name or by Sector97's 1st edits.

Found an alternative solution that might make it a bit easier to achieve what you're after. Credit to A.Pond at
https://stackoverflow.com/a/62050505
Use the google search api to get the links from the results.
You may need to install the library first
pip install google
You can then use the api to quickly extract an arbitrary number of links:
from googlesearch import search
links = []
query = 'site:linkedin.com/in AND "python developer" AND "London"'
for j in search(query, tld = 'com',start = 0,stop = 100,pause=4):
links.append(j)
I got the first 100 results but you can play around with the parameters to get more or less as you need.
You can see more about this api here:
https://www.geeksforgeeks.org/performing-google-search-using-python-code/

I think I found the error in your code.
Instead of using
linkedin_urls = driver.find_elements_by_css_selector("yuRUbf > a")
Try this instead:
web_elements = driver.find_elements_by_class_name("yuRUbf")
That gets you the parent elements. You can then extract the url text using a simple list comprehension:
linkedin_urls = [elem.find_element_by_css_selector('a').get_attribute('href') for elem in web_elements]

Related

Python Scraper Won't Complete

I am using this to code to scrape emails from google search results. However, it only scrapes the first 10 results despite having 100 search results loaded.
Ideally, I would like for it to scrape all search results.
Is there a reason for this?
from selenium import webdriver
import time
import re
import pandas as pd
PATH = 'C:\Program Files (x86)\chromedriver.exe'
l=list()
o={}
target_url = "https://www.google.com/search?q=solicitors+wales+%27email%27+%40&rlz=1C1CHBD_en-GBIT1013IT1013&sxsrf=AJOqlzWC1oRbVtWcmcIgC4-3ZnGkQ8sP_A%3A1675764565222&ei=VSPiY6WeDYyXrwStyaTwAQ&ved=0ahUKEwjlnIy9lYP9AhWMy4sKHa0kCR4Q4dUDCA8&uact=5&oq=solicitors+wales+%27email%27+%40&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIFCAAQogQyBwgAEB4QogQyBQgAEKIESgQIQRgASgQIRhgAUABYAGD4AmgAcAF4AIABc4gBc5IBAzAuMZgBAKABAcABAQ&sclient=gws-wiz-serp"
driver=webdriver.Chrome(PATH)
driver.get(target_url)
email_pattern = r"[A-Za-z0-9._%+-]+#[A-Za-z0-9.-]+\.[A-Z|a-z]{2,4}"
html = driver.page_source
emails = re.findall(email_pattern, html)
time.sleep(10)
df = pd.DataFrame(emails, columns=['Email Addresses'])
df.to_excel('email_addresses_.xlsx',index=False)
# print(emails)
driver.close()
The code is working as expected and scraping 10 results which is the default from google search. You can use the methods like 'find_element_by_xpath' to find the next button and click it.
This operation needs to be done till the sufficient results are collected in loop. Refer this for more details selenium locating elements
How to use the selenium commands, probably you can look upto web. I found one similar question which can provide some reference
Following up on Bijendra's answer,
you could update the code as below:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re
import pandas as pd
PATH = 'C:\Program Files (x86)\chromedriver.exe'
l=list()
o={}
target_url = "https://www.google.com/search?q=solicitors+wales+%27email%27+%40&rlz=1C1CHBD_en-GBIT1013IT1013&sxsrf=AJOqlzWC1oRbVtWcmcIgC4-3ZnGkQ8sP_A%3A1675764565222&ei=VSPiY6WeDYyXrwStyaTwAQ&ved=0ahUKEwjlnIy9lYP9AhWMy4sKHa0kCR4Q4dUDCA8&uact=5&oq=solicitors+wales+%27email%27+%40&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIFCAAQogQyBwgAEB4QogQyBQgAEKIESgQIQRgASgQIRhgAUABYAGD4AmgAcAF4AIABc4gBc5IBAzAuMZgBAKABAcABAQ&sclient=gws-wiz-serp"
driver=webdriver.Chrome(PATH)
driver.get(target_url)
emails = []
email_pattern = r"[A-Za-z0-9._%+-]+#[A-Za-z0-9.-]+\.[A-Z|a-z]{2,4}"
for i in range(2):
html = driver.page_source
for e in re.findall(email_pattern, html):
emails.append(e)
a_attr = driver.find_element(By.ID,"pnnext")
a_attr.click()
time.sleep(2)
df = pd.DataFrame(emails, columns=['Email Addresses'])
df.to_csv('email_addresses_.csv',index=False)
driver.close()
You could either change the range value passed in for loop or entirely replace the for loop with while loop so instead of
for i in range(2):
You could do:
while len(emails) < 100:
Make sure to manage the time as to when the page navigates to next page and wait for the next page to load before extracting the available emails and then moving on to clicking the next button on search result page.
Make sure to refer to docs to get a clear idea of what you should do to achieve what you want to. Happy Hacking!!
Selenium loads its own empty browser so your google settings for 100 results need to be on the code because the default is 10 results which is what your getting. You will have better luck using query parameters and adding the one for the number of results to the end of your URL
If you need further information on query parameters to achieve this its the second method described below
tldevtech.com/how-to-show-100-results-per-page-in-google-search

Python beautiful soup web scraper doesn't return tag contents

I'am trying to scrape matches and their respective odds from local bookie site but every site i try my web scraper doesn't return anything rather just prints "Process finished with exit code 0" but doesn't return anything.
Can someone help me crack open the containers and get out the contents.
i have tried all the above sites for almost a month but with no success. The problem seems to be with the exact div, class or probably span element layout.
https://www.betlion.co.ug/
https://www.betpawa.ug/
https://www.premierbet.ug/
for example i tried link 2 in the code as shown
import requests
from bs4 import BeautifulSoup
url = "https://www.betpawa.ug/"
response = requests.get (url, timeout=5)
content = BeautifulSoup (response.content, "html.parser")
for match in content.findAll("div",attrs={"class":"events-container prematch", "id":"Bp-Event-591531"}):
print (match.text.strip())
i expect the program to return a list of matches, odds and all the other components of the container. however the program runs and just prints " "Process finished with exit code 0" nothing else
it looks like the base site gets loaded in two phases
Load some HTML structure for the page,
Use JavaScript to fill in the contents
You can prove this to yourself by right clicking on the page, do "view page source" and then searching for "events-container" (it is not there).
So you'll need something more powerful than requests + bs4. I have heard of folks using Selenium to do this, but I'm not familiar with it.
You should consider using urllib3 instead of requests.
from urllib.request import Request, urlopen.
- build your req:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
- retrieve document:
res = urlopen(req)
- parse it using bs4:
html = BeautifulSoup (res, 'html.parser')
Like Chris Curvey described, the problem is that requests can't execute the JavaScript of the page. If you print your content variable you can see that the page would display a message like: "JavaScript Required! To provide you with the best possible product, our website requires JavaScript to function..." With Selenium you control an full browser in form of an WebDriver (for eample ChromeDriver binary for the Google Chrome Browser):
from bs4 import BeautifulSoup
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('headless')
driver = webdriver.Chrome(chrome_options = chrome_options)
url = "https://www.betpawa.ug/"
driver.get(url)
page = driver.page_source
content = BeautifulSoup(page, 'html.parser')
for match in content.findAll("div",attrs={"class":"events-container"}):
print (match.text.strip())
Update:
In Line 13 the command print (match.text.strip()) simply extract only the text elements for each match-div's wich has the class-attribute "events-container".
If you want to extract more specific content you can access each match over the match variable.
You need to know:
which of the avabile information you want
and how to indentify this information inside the match-div's
structure.
in which data-type you need this information
To make it easy run the program, open the developer tools of chrome with key F12, on the left top corner you see now the icon for "select an element ...",
if you click on the icon and click in the browser on the desired element you see in the area under the icon the equivalent source.
Analyse it carefully to get the info's you need, for example:
The Title of the Football match is the first h3-Tag in the match-div
and is an string
The Odd's shown are span-tag's with the class event-odds and an
number (float/double)
Search the function you need in Google or in the reference to the package you use (BeautifulSoup4).
Let's try to get it quick and dirty by using the BeautifulSoup functions on the match variable to don't get the elements of the full site (have replaced the whitespace with tabs):
# (1) lets try to find the h3-tag
title_tags = match.findAll("h3") # use on match variable
if len(title_tags) > 0: # at least one found?
title = title_tags[0].getText() # get the text of the first one
print("Title: ", title) # show it
else:
print("no h3-tags found")
exit()
# (2) lets try to get some odds as numbers in the order in which they are displayed
odds_tags = match.findAll("span", attrs={"class":"event-odds"})
if len(odds_tags) > 2: # at least three found?
odds = [] # create an list
for tag in odds_tags: # loop over the odds_tags we found
odd = tag.getText() # get the text
print("Odd: ", odd)
# good but it is an string, you can't compare it with an number in
# python and expect an good result.
# You have to clean it and convert it:
clean_odd = odd.strip() # remove empty spaces
odd = float(clean_odd) # convert it to float
print("Odd as Number:", odd)
else:
print("something wen't wrong with the odds")
exit()
input("Press enter to try it on the next match!")

How to retrieve the values of dynamic html content using Python

I'm using Python 3 and I'm trying to retrieve data from a website. However, this data is dynamically loaded and the code I have right now doesn't work:
url = eveCentralBaseURL + str(mineral)
print("URL : %s" % url);
response = request.urlopen(url)
data = str(response.read(10000))
data = data.replace("\\n", "\n")
print(data)
Where I'm trying to find a particular value, I'm finding a template instead e.g."{{formatPrice median}}" instead of "4.48".
How can I make it so that I can retrieve the value instead of the placeholder text?
Edit: This is the specific page I'm trying to extract information from. I'm trying to get the "median" value, which uses the template {{formatPrice median}}
Edit 2: I've installed and set up my program to use Selenium and BeautifulSoup.
The code I have now is:
from bs4 import BeautifulSoup
from selenium import webdriver
#...
driver = webdriver.Firefox()
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html)
print "Finding..."
for tag in soup.find_all('formatPrice median'):
print tag.text
Here is a screenshot of the program as it's executing. Unfortunately, it doesn't seem to be finding anything with "formatPrice median" specified.
Assuming you are trying to get values from a page that is rendered using javascript templates (for instance something like handlebars), then this is what you will get with any of the standard solutions (i.e. beautifulsoup or requests).
This is because the browser uses javascript to alter what it received and create new DOM elements. urllib will do the requesting part like a browser but not the template rendering part. A good description of the issues can be found here. This article discusses three main solutions:
parse the ajax JSON directly
use an offline Javascript interpreter to process the request SpiderMonkey, crowbar
use a browser automation tool splinter
This answer provides a few more suggestions for option 3, such as selenium or watir. I've used selenium for automated web testing and its pretty handy.
EDIT
From your comments it looks like it is a handlebars driven site. I'd recommend selenium and beautiful soup. This answer gives a good code example which may be useful:
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Firefox()
driver.get('http://eve-central.com/home/quicklook.html?typeid=34')
html = driver.page_source
soup = BeautifulSoup(html)
# check out the docs for the kinds of things you can do with 'find_all'
# this (untested) snippet should find tags with a specific class ID
# see: http://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-by-css-class
for tag in soup.find_all("a", class_="my_class"):
print tag.text
Basically selenium gets the rendered HTML from your browser and then you can parse it using BeautifulSoup from the page_source property. Good luck :)
I used selenium + chrome
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = "www.sitetotarget.com"
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')`
Building off another answer. I had a similar issue. wget and curl do not work well anymore to get the content of a web page. It's particularly broken with dynamic and lazy content. Using Chrome (or Firefox or Chromium version of Edge) allows you to deal with redirects and scripting.
Below will launch an instance of Chrome, increase the timeout to 5 sec, and navigate this browser instance to a url. I ran this from Jupyter.
import time
from tqdm.notebook import trange, tqdm
from PIL import Image, ImageFont, ImageDraw, ImageEnhance
from selenium import webdriver
driver = webdriver.Chrome('/usr/bin/chromedriver')
driver.set_page_load_timeout(5)
time.sleep(1)
driver.set_window_size(2100, 9000)
time.sleep(1)
driver.set_window_size(2100, 9000)
## You can manually adjust the browser, but don't move it after this.
## Do stuff ...
driver.quit()
Example of grabbing dynamic content and screenshots of the anchored (hence the "a" tag) HTML object, another name for hyperlink:
url = 'http://www.example.org' ## Any website
driver.get(url)
pageSource = driver.page_source
print(driver.get_window_size())
locations = []
for element in driver.find_elements_by_tag_name("a"):
location = element.location;
size = element.size;
# Collect coordinates of object: left/right, top/bottom
x1 = location['x'];
y1 = location['y'];
x2 = location['x']+size['width'];
y2 = location['y']+size['height'];
locations.append([element,x1,y1,x2,y2, x2-x1, y2-y1])
locations.sort(key = lambda x: -x[-2] - x[-1])
locations = [ (el,x1,y1,x2,y2, width,height)
for el,x1,y1,x2,y2,width,height in locations
if not (
## First, filter links that are not visible (located offscreen or zero pixels in any dimension)
x2 <= x1 or y2 <= y1 or x2<0 or y2<0
## Further restrict if you expect the objects to be around a specific size
## or width<200 or height<100
)
]
for el,x1,y1,x2,y2,width,height in tqdm(locations[:10]):
try:
print('-'*100,f'({width},{height})')
print(el.text[:100])
element_png = el.screenshot_as_png
with open('/tmp/_pageImage.png', 'wb') as f:
f.write(element_png)
img = Image.open('/tmp/_pageImage.png')
display(img)
except Exception as err:
print(err)
Installation for mac+chrome:
pip install selenium
brew cask install chromedriver
brew cask install google-chrome
I was using Mac for the original answer and Ubuntu + Windows 11 preview via WSL2 after updating. Chrome ran from Linux side with X service on Windows to render the UI.
Regarding responsibility, please respect robots.txt on each site.
I know this is an old question, but sometimes there is a better solution than using heavy selenium.
This request module for python comes with JS support (in the background it is still chromium) and you can still use beautifulsoup like normal.
Though, sometimes if you have to click elements or sth, I guess selenium is the only option.

How to store the information in a 'NoType' HTML element in Selenium - Python

I'm trying to take some information from an HTML element using Selenium - Python, and I'm unsure on how to save it. I'm kind of new to programming, but literate enough to where I know how to write code, but it's hard to research answers and adapt those to my code. I've looked on Google and can't seem to find anything that would help me specifically with what I need.
Here is the HTML element I need to get information from:
<span id="ctl00_plnMain_rptAssigmnetsByCourse_ctl00_lblOverallAverage">99.05</span>
I need to retrieve the 99.05 and store it in a variable named "avg."
Here is my code I have for the Selenium test.
username = raw_input("Username: ")
password = raw_input("Password: ")
browser = webdriver.Firefox() # Get local session of firefox
browser.get("https://hac.mckinneyisd.net/homeaccess/default.aspx") # Load page
elem = browser.find_element_by_name("ctl00$plnMain$txtLogin") # Find the query box
elem.send_keys(username)
elem = browser.find_element_by_name("ctl00$plnMain$txtPassword") # Find the password box
elem.send_keys(password + Keys.RETURN)
time.sleep(0.2) # Let the page load
elem = browser.find_element_by_link_text("Classwork").click()
time.sleep(0.2)
???????????????
browser.close()
What should I put in the ???... to take the 99.05 from the object and save it as "avg?" I have tried:
content = elem.text("td[#id='ctl00....lblOverallAverage']"
...but I get an error saying that I can't do that because it has no type.
Try:
elem = browser.find_element_by_id("ctl00_plnMain_rptAssigmnetsByCourse_ctl00_lblOverallAverage")
avg = elem.getText()

python selenium - how can I get all visible text on the page (i.e., not the page source) [duplicate]

I've been googling this all day with out finding the answer, so apologies in advance if this is already answered.
I'm trying to get all visible text from a large number of different websites. The reason is that I want to process the text to eventually categorize the websites.
After a couple of days of research, I decided that Selenium was my best chance. I've found a way to grab all the text, with Selenium, unfortunately the same text is being grabbed multiple times:
from selenium import webdriver
import codecs
filen = codecs.open('outoput.txt', encoding='utf-8', mode='w+')
driver = webdriver.Firefox()
driver.get("http://www.examplepage.com")
allelements = driver.find_elements_by_xpath("//*")
ferdigtxt = []
for i in allelements:
if i.text in ferdigtxt:
pass
else:
ferdigtxt.append(i.text)
filen.writelines(i.text)
filen.close()
driver.quit()
The if condition inside the for loop is an attempt at eliminating the problem of fetching the same text multiple times - it does not however, only work as planned on some webpages. (it also makes the script A LOT slower)
I'm guessing the reason for my problem is that - when asking for the inner text of an element - I also get the inner text of the elements nested inside the element in question.
Is there any way around this? Is there some sort of master element I grab the inner text of? Or a completely different way that would enable me to reach my goal? Any help would be greatly appreciated as I'm out of ideas for this one.
Edit: the reason I used Selenium and not Mechanize and Beautiful Soup is because I wanted JavaScript tendered text
Using lxml, you might try something like this:
import contextlib
import selenium.webdriver as webdriver
import lxml.html as LH
import lxml.html.clean as clean
url="http://www.yahoo.com"
ignore_tags=('script','noscript','style')
with contextlib.closing(webdriver.Firefox()) as browser:
browser.get(url) # Load page
content=browser.page_source
cleaner=clean.Cleaner()
content=cleaner.clean_html(content)
with open('/tmp/source.html','w') as f:
f.write(content.encode('utf-8'))
doc=LH.fromstring(content)
with open('/tmp/result.txt','w') as f:
for elt in doc.iterdescendants():
if elt.tag in ignore_tags: continue
text=elt.text or ''
tail=elt.tail or ''
words=' '.join((text,tail)).strip()
if words:
words=words.encode('utf-8')
f.write(words+'\n')
This seems to get almost all of the text on www.yahoo.com, except for text in images and some text that changes with time (done with javascript and refresh perhaps).
Here's a variation on #unutbu's answer:
#!/usr/bin/env python
import sys
from contextlib import closing
import lxml.html as html # pip install 'lxml>=2.3.1'
from lxml.html.clean import Cleaner
from selenium.webdriver import Firefox # pip install selenium
from werkzeug.contrib.cache import FileSystemCache # pip install werkzeug
cache = FileSystemCache('.cachedir', threshold=100000)
url = sys.argv[1] if len(sys.argv) > 1 else "https://stackoverflow.com/q/7947579"
# get page
page_source = cache.get(url)
if page_source is None:
# use firefox to get page with javascript generated content
with closing(Firefox()) as browser:
browser.get(url)
page_source = browser.page_source
cache.set(url, page_source, timeout=60*60*24*7) # week in seconds
# extract text
root = html.document_fromstring(page_source)
# remove flash, images, <script>,<style>, etc
Cleaner(kill_tags=['noscript'], style=True)(root) # lxml >= 2.3.1
print root.text_content() # extract text
I've separated your task in two:
get page (including elements generated by javascript)
extract text
The code is connected only through the cache. You can fetch pages in one process and extract text in another process or defer to do it later using a different algorithm.

Categories

Resources