My code works, but it only crawls the first 10 pages of results; pages after that cannot be fetched.
When I run it I get only 97 URLs: pages 1 through 10 are crawled, but nothing past page 10 comes back.
This is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def search(keyword):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36'}
    html = requests.get('https://www.google.co.kr/search?q={}&num=100&sourceid=chrome&ie=UTF-8'.format(keyword), headers=headers).text
    soup = BeautifulSoup(html, 'html.parser')
    result = []
    for i in soup.find_all('div', {'class': 'yuRUbf'}):
        result.append(i.find('a', href=True)['href'])
    df = pd.DataFrame(result)
    df.to_csv('D:\\products.txt', index=False, encoding='utf-8')

search('iphone')
Based on my own testing, your search retrieves only about 5 pages of results.
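If you need more than that first batch, a hedged workaround is to page through the results with Google's start parameter instead of relying on num=100. A minimal sketch, reusing the yuRUbf class from your code (Google changes these class names regularly, and heavy automated querying can get blocked):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36'}

def search_pages(keyword, pages=5):
    result = []
    for page in range(pages):
        # Google paginates organic results with start=0, 10, 20, ...
        url = 'https://www.google.co.kr/search?q={}&start={}'.format(keyword, page * 10)
        soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
        for i in soup.find_all('div', {'class': 'yuRUbf'}):
            result.append(i.find('a', href=True)['href'])
    return result

print(len(search_pages('iphone')))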
I tried to extract job descriptions from a job site. I got all the details except the job description; I'm attaching my code and details below. With this code I get the company details, location, and some other data separately, and in the same way I need the job description for every job. But when the loop runs, nothing is appended to Job_Description.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
url = "https://in.indeed.com/jobs?q=software%20engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
Links = soup.find("div", {"class": "pagination"}).find_all('a')
Page = [tag['href'] for tag in Links]

for pageid in range(0, 2):
    pageid = 10 * pageid
    website = f'https://in.indeed.com/jobs?q=software+engineer&l=Kerala&sort=date&start={pageid}'
    soup = BeautifulSoup(requests.get(website, headers=headers).content, "html.parser")
    SubLinks = soup.find("div", {"class": "pagination"}).find_all('a')
    Page = list(set(Page + [tag['href'] for tag in SubLinks]))
    for job in soup.select('a[id^="job_"]'):
        job_id = job["id"].split("_")[-1]
        # s = BeautifulSoup(requests.get(api_url.format(job_id=job_id), headers=headers).content, "html.parser")

data = []
Company_Name = []
Location = []
Job_Description = []
for div_block in soup.find_all('span', class_=['companyName'], style=None):
    Company_Name.append([line.strip() for line in div_block.stripped_strings])
for div_block in soup.find_all('div', class_=['companyLocation'], style=None):
    Location.append([line.strip() for line in div_block.stripped_strings])
for div_block in soup.find_all('div', class_=['jobsearch-JobComponent-description icl-u-xs-mt--md'], style=None):
    Job_Description.append([line.strip() for line in div_block.stripped_strings])
Since you are working with the paginated search section of indeed.com, you are not going to get the full job description unless you open each job's own page.
That said, I believe what you are looking for is the job snippet, which will give you the results you want based on your code's search criteria:
for div_block in soup.find_all('div', class_=['job-snippet'], style=None):
    Job_Description.append([line.strip() for line in div_block.stripped_strings])
Based on what you are looking for, though, I think you actually want all of the data instead of just the snippet, so I would consider doing it this way:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests, json
from bs4 import BeautifulSoup

def main():
    url = "https://in.indeed.com/jobs?q=software%20engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
    # The job cards are embedded in the page as a JSON payload inside a script line.
    job_card = None
    for row in str(soup).split('\n'):
        if 'window.mosaic.providerData["mosaic-provider-jobcards"]=' in row:
            job_card = row.replace('window.mosaic.providerData["mosaic-provider-jobcards"]=', '').replace(';', '')
    job_card_data = json.loads(job_card)
    job_list = list()
    for job in job_card_data['metaData']['mosaicProviderJobCardsModel']['results']:
        job_dict = job
        # Follow each card's own page to pull the full description.
        job_full_soup_url = 'https://in.indeed.com{}'.format(job['viewJobLink'])
        job_full_soup = BeautifulSoup(requests.get(job_full_soup_url, headers=headers).content, "html.parser")
        for div_block in job_full_soup.find_all('div', class_=['jobsearch-jobDescriptionText'], style=None):
            job_dict['full_description'] = [line.strip() for line in div_block.stripped_strings]
        job_list.append(job_dict)
    print(json.dumps(job_list, indent=4))

if __name__ == '__main__':
    main()
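One fragility worth guarding in this approach: if Indeed changes that embedded script line (or serves an anti-bot page), job_card stays None and json.loads(job_card) raises a TypeError. A small defensive check, not part of the original answer, makes the failure explicit:

if job_card is None:
    # The mosaic-provider-jobcards script line was not found; the page
    # layout (or an anti-bot interstitial) has likely changed.
    raise RuntimeError('job card JSON not found in page source')
job_card_data = json.loads(job_card)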
I'm unable to find the ratings (the number next to the stars) on the Rakuten page shown in the picture below.
I tried to use BeautifulSoup to locate the element, but it doesn't work.
import time
import requests
!pip install beautifulsoup4
import bs4
!pip install lxml
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
products = []
for i in range(1, 2):  # Iterate from page 1 to the last page
    url = "https://www.rakuten.com.tw/shop/pandq/product/?l-id=tw_shop_inshop_cat&p={}".format(i)
    r = requests.get(url, headers=headers)
    soup = bs4.BeautifulSoup(r.text, "lxml")
    Soup = soup.find_all("div", class_='b-mod-item-vertical products-grid-section')
    for product in Soup:
        productcount = product.find_all("div", class_='b-content')
        print(productcount)
What happens?
The element selection is not quite right, so you won't get the expected result.
How to fix?
Since your screenshot shows different things (price / rating), I will focus on the rating.
First select all the items:
soup.select('.b-item')
Then iterate the result set and select the <a> that holds the rating:
item.select_one('.product-review')
Get rid of all the special characters:
item.select_one('.product-review').get_text(strip=True).strip('(|)')
Example
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
r = requests.get('https://www.rakuten.com.tw/shop/pandq/product/?l-id=tw_shop_inshop_cat&p=1',headers=headers)
soup = BeautifulSoup(r.content, 'lxml')
for item in soup.select('.b-item'):
    rating = item.select_one('.product-review').get_text(strip=True).strip('(|)') if item.select_one('.product-review') else None
    print(rating)
Output
5
36
21
32
8
...
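To fold this back into the page loop from your question, here is a minimal sketch; the upper bound of 3 pages is a placeholder, so substitute the shop's real last page:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
ratings = []
for p in range(1, 4):  # placeholder bound; set to the shop's actual last page + 1
    r = requests.get('https://www.rakuten.com.tw/shop/pandq/product/?l-id=tw_shop_inshop_cat&p={}'.format(p), headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    for item in soup.select('.b-item'):
        review = item.select_one('.product-review')
        ratings.append(review.get_text(strip=True).strip('(|)') if review else None)
print(ratings)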
I have got this webpage https://www.epant.gr/apofaseis-gnomodotiseis/item/1451-apofasi-730-2021.html and I need to scrape the second-to-last row from the large table.
In other words, I need to get this (Ένδικα Μέσα -) from the table.
This is my progress so far:
from bs4 import BeautifulSoup
import requests
import csv
URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/item/1451-apofasi-730-2021.html'
headers1 = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
"X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
"X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB" }
page = requests.get(URL, headers = headers1)
soup1 = BeautifulSoup(page.content,"html.parser")
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
soup3 = soup2.find('td', text = "Ένδικα Μέσα")
print(soup3)
Thank you very much
You are near to a solution. Clean up your soups (a single BeautifulSoup pass is enough) and get the parent of your result; this will give you the whole tr:
soup.find('td', text = "Ένδικα Μέσα").parent.get_text(strip=True)
or use find_next('td') to access the text of its neighbour:
soup.find('td', text = "Ένδικα Μέσα").find_next('td').text
Example
from bs4 import BeautifulSoup
import requests
import csv
URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/item/1451-apofasi-730-2021.html'
headers1 = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
"X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
"X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB" }
page = requests.get(URL, headers = headers1)
soup = BeautifulSoup(page.content,"html.parser")
row = soup.find('td', text = "Ένδικα Μέσα").parent.get_text(strip=True)
print(row)
Output
Eνδικα Μέσα -
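Side note: text= is the pre-4.4 name of BeautifulSoup's string= argument, so on current versions the same lookup can be written without the deprecation warning:

row = soup.find('td', string="Ένδικα Μέσα").parent.get_text(strip=True)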
You can use a CSS selector for that field. There's an easy way to copy the selector for an element using your browser's inspector: right-click the HTML tag you want and choose Copy > Copy selector.
With Beautiful Soup you can then use soup.select(selector). The documentation describes this in more detail.
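A minimal sketch of that workflow; the selector below is a generic stand-in, so paste whatever your inspector actually copies for the element:

import requests
from bs4 import BeautifulSoup

URL = 'https://www.epant.gr/apofaseis-gnomodotiseis/item/1451-apofasi-730-2021.html'
page = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(page.content, 'html.parser')
# Replace 'table tr td' with the selector copied from the browser inspector.
for td in soup.select('table tr td'):
    print(td.get_text(strip=True))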
I need your help to get the "Description" content of this URL using BeautifulSoup in Python (as shown below).
I have tried the code below, but it returns only None!
import requests as rq
from bs4 import BeautifulSoup
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
page = rq.get(url, headers=hdr)
soup = BeautifulSoup(page.content, "html.parser")
description = soup.find('div', {'class': 'force-wrapping ng-star-inserted'})
I tried it and saw that the soup doesn't have the class force-wrapping ng-star-inserted, because you fetched the page source, which is different from what you see in the dev tools. To view the page source, press Ctrl+U. There you can see that the description is in a meta tag whose name is description. So what you need to do is find this tag and take its content. For example:
res = soup.find('meta', {"name":"description"})
print(res['content'])
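Putting the whole answer together as a runnable sketch (url stands in for the page from the question, which isn't shown in the post):

import requests as rq
from bs4 import BeautifulSoup

url = ...  # the page URL from the question (not included in the post)
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
page = rq.get(url, headers=hdr)
soup = BeautifulSoup(page.content, 'html.parser')
# The description lives in <meta name="description" content="...">.
res = soup.find('meta', {'name': 'description'})
print(res['content'] if res else None)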
I have a CSS selector that works fine when executed in the Chrome JS console, but it does not work when run through BeautifulSoup on one example, yet works on another (I'm unable to discern the difference between the two).
url_1 = 'https://www.amazon.com/s?k=bacopa&page=1'
url_2 = 'https://www.amazon.com/s?k=acorus+calamus&page=1'
The following query works fine on both when executed in the Chrome console.
document.querySelectorAll('div.s-result-item')
Then, running the two URLs through BeautifulSoup, this is the output I get.
url_1 (works)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get(url_1, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
listings = soup.select('div.s-result-item')
print(len(listings))
output: 53 (correct)
url_2 (does not work)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get(url_2, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
listings = soup.select('div.s-result-item')
print(len(listings))
output: 0 (incorrect - expected: 49)
Does anyone know what might be going on here, and how I can get the CSS selector to work with BeautifulSoup?
I think it is the HTML. Change the parser to 'lxml'. You can also shorten your CSS selector to just the class, and re-use the connection with a Session object for efficiency.
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.amazon.com/s?k=bacopa&page=1', 'https://www.amazon.com/s?k=acorus+calamus&page=1']

with requests.Session() as s:
    for url in urls:
        r = s.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = bs(r.content, 'lxml')
        listings = soup.select('.s-result-item')
        print(len(listings))
Try the Selenium library to download the webpage:
from selenium import webdriver
from bs4 import BeautifulSoup
url_1 = 'https://www.amazon.com/s?k=bacopa&page=1'
url_2 = 'https://www.amazon.com/s?k=acorus+calamus&page=1'
# set the chromedriver path
driver = webdriver.Chrome('/usr/bin/chromedriver')
# download the webpage
driver.get(url_2)
soup = BeautifulSoup(driver.page_source, 'html.parser')
listings = soup.find_all('div',{'class':'s-result-item'})
print(len(listings))
Output:
url_1: 50
url_2: 48
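Two small notes on the Selenium variant: newer Selenium releases (4.x) take a Service object instead of a bare driver path, and the browser should be closed when you are done. A sketch under those assumptions (the chromedriver path is the one from the answer above):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service('/usr/bin/chromedriver'))
try:
    driver.get('https://www.amazon.com/s?k=acorus+calamus&page=1')
    html = driver.page_source  # hand this to BeautifulSoup as above
finally:
    driver.quit()  # always release the browser process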