I tried to extract the job description from a job site. I got all the details except the job description. I'm attaching my code and details below. With this code I extracted the company details, the location, and some other data separately; in the same way I need the full job description for every job. While running, appending to Job_Description produced no data.
import requests
from bs4 import BeautifulSoup

# One headers dict only — the original defined two and the first (Firefox UA)
# was silently overwritten by the second before ever being used.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

url = "https://in.indeed.com/jobs?q=software%20engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

# Pagination links from the first results page.
Links = soup.find("div", {"class": "pagination"}).find_all('a')
Page = [tag['href'] for tag in Links]

# Accumulators initialised once, outside the page loop, so results from all
# pages are kept rather than being reset on each iteration.
data = []
Company_Name = []
Location = []
Job_Description = []

for pageid in range(0, 2):
    # Indeed paginates with ?start=0,10,20,...
    start = 10 * pageid
    website = f'https://in.indeed.com/jobs?q=software+engineer&l=Kerala&sort=date&start={start}'
    soup = BeautifulSoup(requests.get(website, headers=headers).content, "html.parser")

    SubLinks = soup.find("div", {"class": "pagination"}).find_all('a')
    Page = list(set(Page + [tag['href'] for tag in SubLinks]))

    # Job ids, available if a per-job detail request is added later.
    for job in soup.select('a[id^="job_"]'):
        job_id = job["id"].split("_")[-1]

    for div_block in soup.find_all('span', class_=['companyName'], style=None):
        Company_Name.append([line.strip() for line in div_block.stripped_strings])
    for div_block in soup.find_all('div', class_=['companyLocation'], style=None):
        Location.append([line.strip() for line in div_block.stripped_strings])
    # The search-results page only carries a short "job-snippet" div; the
    # "jobsearch-JobComponent-description ..." block exists only on the
    # individual job page, which is why the original selector matched nothing.
    for div_block in soup.find_all('div', class_=['job-snippet'], style=None):
        Job_Description.append([line.strip() for line in div_block.stripped_strings])
Since you are working on the paginated search section of indeed.com you are not going to be getting the full job description unless you select the job and go into it.
With that said, I believe what you are looking for is the job snippet which would give you the results you are looking for based on the search criteria of your code.
# Each search-result card carries a short "job-snippet" div; the full
# description only exists on the individual job page.  The appended line
# must be indented under the for — as posted it was an IndentationError.
for div_block in soup.find_all('div', class_=['job-snippet'], style=None):
    Job_Description.append([line.strip() for line in div_block.stripped_strings])
Based on what you are looking for I think you want to actually get all of the data instead of just the snippet so I would consider doing it this way.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests, json
from bs4 import BeautifulSoup


def main():
    """Scrape Indeed search results plus the full description of each job.

    The search page embeds its job cards as JSON assigned to
    ``window.mosaic.providerData["mosaic-provider-jobcards"]``; we extract
    that JSON, then fetch each job's own page for the full description text
    (div class ``jobsearch-jobDescriptionText``) and print everything as JSON.
    """
    url = "https://in.indeed.com/jobs?q=software%20engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

    job_card = None
    marker = 'window.mosaic.providerData["mosaic-provider-jobcards"]='
    for row in str(soup).split('\n'):
        if marker in row:
            job_card = row.replace(marker, '').replace(';', '')
    if job_card is None:
        # Page layout changed or the request was blocked; fail loudly instead
        # of letting json.loads(None) raise a confusing TypeError.
        raise RuntimeError('mosaic-provider-jobcards data not found in page')

    job_card_data = json.loads(job_card)
    job_list = []
    for job in job_card_data['metaData']['mosaicProviderJobCardsModel']['results']:
        job_dict = job
        job_full_soup_url = 'https://in.indeed.com{}'.format(job['viewJobLink'])
        job_full_soup = BeautifulSoup(requests.get(job_full_soup_url, headers=headers).content, "html.parser")
        for div_block in job_full_soup.find_all('div', class_=['jobsearch-jobDescriptionText'], style=None):
            job_dict['full_description'] = [line.strip() for line in div_block.stripped_strings]
        job_list.append(job_dict)
    print(json.dumps(job_list, indent=4))


if __name__ == '__main__':
    main()
Related
It works fine, but I found that it only crawls the first 10 pages, and the pages after that cannot be crawled.
I run the code and I only get 97 URLs, pages from 1 to 10 can be crawled, pages after 10+ cannot be fetched,
this is my code
import requests
from bs4 import BeautifulSoup
import pandas as pd


def search(keyword):
    """Google-search *keyword* and save the organic result URLs to a file.

    NOTE(review): ``num=100`` asks Google for up to 100 results in a single
    response, which is presumably why only ~100 URLs (about 10 pages' worth)
    ever come back — anything beyond that would need follow-up requests with
    a ``start=`` offset.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36'}
    html = requests.get('https://www.google.co.kr/search?q={}&num=100&sourceid=chrome&ie=UTF-8'.format(keyword), headers=headers).text
    soup = BeautifulSoup(html, 'html.parser')
    result = []
    # Each organic result's link sits inside a div with class "yuRUbf".
    for i in soup.find_all('div', {'class': 'yuRUbf'}):
        result.append(i.find('a', href=True)['href'])
    df = pd.DataFrame(result)
    df.to_csv('D:\\products.txt', index=False, encoding='utf-8')


search('iphone')
In my testing, your search retrieves only about 5 pages of results.
I have got this webpage https://www.epant.gr/apofaseis-gnomodotiseis/item/1451-apofasi-730-2021.html
and I need to scrape the second last row from the large table.
In other words, I need to get this (Ένδικα Μέσα -) from the table.
This is my progress so far
# Fetch an EPANT decision page and locate the table cell labelled
# "Ένδικα Μέσα".
#
# Bug fix: the original did `from bs4 import BeautifulSoup as soup` and then
# called `BeautifulSoup(...)`, which is a NameError — import it under its
# real name instead.
from bs4 import BeautifulSoup
import requests
import csv

URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/item/1451-apofasi-730-2021.html'
headers1 = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
            "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB"}

page = requests.get(URL, headers=headers1)
soup1 = BeautifulSoup(page.content, "html.parser")
# Re-parsing soup1.prettify() (the original soup2) was redundant — the cell
# is found the same way on the first parse.
soup3 = soup1.find('td', text="Ένδικα Μέσα")
print(soup3)
Thank you very much
Thank you very much, it works like a charm
You are near to a solution — clean up your soups and try to get the parent of your result; this will give you the whole tr:
soup.find('td', text = "Ένδικα Μέσα").parent.get_text(strip=True)
or use find_next('td') to access the text of its neighbour:
soup.find('td', text = "Ένδικα Μέσα").find_next('td').text
Example
# Fetch an EPANT decision page and print the whole table row whose label
# cell reads "Ένδικα Μέσα".
from bs4 import BeautifulSoup
import requests
import csv

URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/item/1451-apofasi-730-2021.html'
headers1 = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
            "X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB"}

page = requests.get(URL, headers=headers1)
soup = BeautifulSoup(page.content, "html.parser")

# `string=` is the modern name for the deprecated `text=` find() argument.
# Guard against the cell being absent so a layout change raises a clear
# error instead of "'NoneType' object has no attribute 'parent'".
cell = soup.find('td', string="Ένδικα Μέσα")
if cell is None:
    raise ValueError('table cell "Ένδικα Μέσα" not found on page')
# .parent climbs from the <td> to its <tr>, giving label and value together.
row = cell.parent.get_text(strip=True)
print(row)
Output
Eνδικα Μέσα -
You can use the selector for that field. There's an easy way to copy the selector for an element: open your browser's inspector, right-click the HTML tag that you want, and choose Copy > Copy Selector.
With beautiful soup you can use the soup.select(selector). The documentation describes this better.
So I'm new to parsing HTML with Python, and I want to get the price of this lumber from the following link:
https://www.lowes.com/pd/2-in-x-4-in-x-8-ft-Whitewood-Stud-Common-1-5-in-x-3-5-in-x-96-in-Actual/1000074211
This is what I have so far, but I'm getting an error that says "AttributeError: 'NoneType' object has no attribute 'text'" :
import requests
from bs4 import BeautifulSoup

# The UA string and URL below were broken across line boundaries in the
# original paste (a syntax error); they are rejoined into single literals.
HEADERS = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"}
URL = "https://www.lowes.com/pd/2-in-x-4-in-x-8-ft-Whitewood-Stud-Common-1-5-in-x-3-5-in-x-96-in-Actual/1000074211"

r = requests.get(URL, headers=HEADERS)
c = r.content
soup = BeautifulSoup(c, "html.parser")

# The price div is most likely rendered by JavaScript, so it can be absent
# from the raw HTML that requests receives; guard against find() returning
# None instead of crashing with "'NoneType' object has no attribute 'text'".
price_box = soup.find("div", class_="sq-bqyKva.ehfErk")
if price_box is None:
    raise ValueError("price element not found; page is JS-rendered - parse the embedded JSON instead")
price = price_box.text.strip()
print(price)
Why am I getting this error and how can I fix it?
I can't see the site, but most probably you're getting the error because BS cannot find any element with a class called "sq-bqyKva.ehfErk".
Can you print out the soup and search for the class manually to see that it actually exists?
Also, based on the class name it looks like the div you are trying to find is dynamically generated using JavaScript which means it is not loaded into DOM when the request is made, which means BS won't be able to find it. If this is the case you might want to look into using other tools such as Selenium.
The site is dynamic, relying on a script to populate the page with data from a JSON string, which can be found in a script tag and parsed via the json module to access the price:
# Pull the price out of the JSON-LD blob the page ships in a <script> tag.
# Bug fix: the snippet called requests.get() without importing requests.
import requests
import json
from bs4 import BeautifulSoup as soup

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"}
d = soup(requests.get('https://www.lowes.com/pd/2-in-x-4-in-x-8-ft-Whitewood-Stud-Common-1-5-in-x-3-5-in-x-96-in-Actual/1000074211', headers=headers).text, 'html.parser')
# The structured-data script tag holds a JSON array; entry 2 is the product
# offer with its price.
data = json.loads(d.select_one('script[data-react-helmet="true"][type="application/ld+json"]').contents[0])
price = data[2]['offers']['price']
Output:
5.98
This is a really difficult site. You will not be able to access the price using that class name. Web scraping becomes difficult if your site has javascript DOM elements. "View Source" will get you the response that Beautifulsoup will get. On closer inspection, your page price is inside a json value under a "script" tag:
import requests
from bs4 import BeautifulSoup
import json

# The price is not in the visible DOM that requests sees; it lives in the
# JSON payload embedded in the page's first <script> tag.
HEADERS = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/91.0.4472.164 Safari/537.36"}
URL = "https://www.lowes.com/pd/2-in-x-4-in-x-8-ft-Whitewood-Stud-Common-1-5-in-x-3-5-in-x-96-in-Actual/1000074211"

response = requests.get(URL, headers=HEADERS)
document = BeautifulSoup(response.content, "html.parser")

# First <script> tag carries the structured data; entry 2 of the JSON array
# is the offer record with the price.
first_script = document.find("script")
payload = json.loads(first_script.text)
print(payload[2]["offers"]["price"])
I need your help to get "Description" content of this URL using BeautifulSoup in Python (as shown below).
I have tried below code but it return None only!
import requests as rq
from bs4 import BeautifulSoup

# NOTE(review): `url` is never defined in this snippet — as posted the
# rq.get() call raises NameError; the target page URL must be assigned first.
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
page = rq.get(url, headers=hdr)
soup = BeautifulSoup(page.content, "html.parser")
# Returns None: presumably this class only exists in the JS-rendered DOM,
# not in the raw HTML source that requests receives — TODO confirm.
description = soup.find('div', {'class': 'force-wrapping ng-star-inserted'})
I tried it and saw that the soup doesn't have the class force-wrapping ng-star-inserted, because you fetched the page source. It is different from what you see in the dev tools; to see the page source you can press Ctrl+U. You will then see that the description is in a meta tag whose name is description. So what you need to do is find this tag and take its content. For example:
# The description is present in the static HTML as <meta name="description">;
# guard against the tag being absent before indexing, so a missing tag does
# not raise "'NoneType' object is not subscriptable".
res = soup.find('meta', {"name": "description"})
if res is not None:
    print(res['content'])
I have a css selector that works fine when executing it in the chrome JS Console, but does not work when running it through beautifulsoup on one example, yet works on another (I'm unable to discern the difference between the two).
url_1 = 'https://www.amazon.com/s?k=bacopa&page=1'
url_2 = 'https://www.amazon.com/s?k=acorus+calamus&page=1'
The following query works fine on both when executing it in the chrome console.
document.querySelectorAll('div.s-result-item')
Then running the two urls through beautifulsoup, this is the output I get.
url_1 (works)
# Demo: html.parser successfully extracts the result cards for url_1.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get(url_1, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
# NOTE(review): stray space in "soup .select" — legal Python, but should
# read soup.select(...).
listings = soup .select('div.s-result-item')
print(len(listings))
output: 53 (correct)
url_2 (does not work)
# Demo: the identical code yields 0 matches for url_2 — presumably
# html.parser chokes on that page's markup (see the lxml suggestion below
# in the thread) — TODO confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get(url_2, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
listings = soup.select('div.s-result-item')
print(len(listings))
output: 0 (incorrect - expected: 49)
Does anyone know what might be going on here and how I can get the css selector to work with beautifulsoup?
I think it is the html. Change the parser to 'lxml'. You can also shorten your css selector to just class and re-use connection with Session object for efficiency.
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.amazon.com/s?k=bacopa&page=1','https://www.amazon.com/s?k=acorus+calamus&page=1']

# Re-use one TCP connection for both requests via a Session, and parse with
# lxml, which copes with markup that html.parser fails on for the second URL.
# (Indentation under `with`/`for` was lost in the original paste — a syntax
# error as posted.)
with requests.Session() as s:
    for url in urls:
        r = s.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = bs(r.content, 'lxml')
        # The bare class selector is enough; every result card carries it.
        listings = soup.select('.s-result-item')
        print(len(listings))
Try selenium library to download the webpage
# Use a real browser (via Selenium) to fetch the page, then parse the
# rendered HTML with BeautifulSoup.
from selenium import webdriver
from bs4 import BeautifulSoup
url_1 = 'https://www.amazon.com/s?k=bacopa&page=1'
url_2 = 'https://www.amazon.com/s?k=acorus+calamus&page=1'
# Path to the chromedriver executable for the local Chrome install.
driver = webdriver.Chrome('/usr/bin/chromedriver')
# Load the page in the browser; page_source then holds the rendered DOM.
driver.get(url_2)
soup = BeautifulSoup(driver.page_source, 'html.parser')
listings = soup.find_all('div',{'class':'s-result-item'})
print(len(listings))
O/P:
url_1: 50
url_2 : 48