Retrieve all elements in Google Trends Data using selenium Python - python

I am trying to write a Python program to gather data from Google Trends (GT) - specifically, I want to automatically open URLs and access the specific values that are displayed in the title.
I have written the code and I am able to scrape data successfully. But when I compare the data returned by the code with what is shown at the URL, the results are only partially returned.
For example, in the image below, the code returns the first title "Manchester United F.C. • Tottenham Hotspur F.C.", but the actual website has 4 results: "Manchester United F.C. • Tottenham Hotspur F.C., International Champions Cup, Manchester".
[Image: Google Trends realtime trending searches page]
[Image: screenshot of the code's output]
We have tried all possible ways of locating elements on the page, but we are still unable to find a fix for this. We didn't want to use Scrapy or Beautiful Soup for this.
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

links = ["https://trends.google.com/trends/trendingsearches/realtime?geo=DE&category=s"]

for link in links:
    Title_temp = []
    Title = ''
    seleniumDriver = r"C:/Users/Downloads/chromedriver_win32/chromedriver.exe"
    chrome_options = Options()
    brow = webdriver.Chrome(executable_path=seleniumDriver, chrome_options=chrome_options)
    try:
        brow.get(link)  # open the url
        try:
            content = brow.find_elements_by_class_name("details-top")
            for element in content:
                Title_temp.append(element.text)
            Title = ' '.join(Title_temp)
        except:
            Title = ''
        brow.quit()
    except Exception as error:
        print(error)
        break

Final_df = pd.DataFrame({'Title': Title_temp})

From what I see, the data is retrieved from an API endpoint you can call directly. I show how to call it and then extract only the title (note that the API call returns more info than just the title). You can explore the breadth of what is returned (which includes article snippets, URLs, image links etc.) here.
import requests
import json
r = requests.get('https://trends.google.com/trends/api/realtimetrends?hl=en-GB&tz=-60&cat=s&fi=0&fs=0&geo=DE&ri=300&rs=20&sort=0')
data = json.loads(r.text[5:])
titles = [story['title'] for story in data['storySummaries']['trendingStories']]
print(titles)
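If you want more than the titles, the same response can be mined further. Below is a rough sketch that pulls a few article-level fields per trending story; the field names ('articles', 'articleTitle', 'source', 'url') are assumptions based on inspecting the JSON, so verify them against your own response.
import requests
import json

r = requests.get('https://trends.google.com/trends/api/realtimetrends?hl=en-GB&tz=-60&cat=s&fi=0&fs=0&geo=DE&ri=300&rs=20&sort=0')
data = json.loads(r.text[5:])  # skip the non-JSON prefix Google prepends to the response

rows = []
for story in data['storySummaries']['trendingStories']:
    # 'articles', 'articleTitle', 'source' and 'url' are assumed field names - check your response
    for article in story.get('articles', []):
        rows.append({
            'story_title': story['title'],
            'article_title': article.get('articleTitle'),
            'source': article.get('source'),
            'url': article.get('url'),
        })

print(rows[:3])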

Here is the code, which printed all the information.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # assumes chromedriver is on PATH

url = "https://trends.google.com/trends/trendingsearches/realtime?geo=DE&category=s"
driver.get(url)
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, 'details-top')))

Title_temp = []
try:
    content = driver.find_elements_by_class_name("details-top")
    for element in content:
        Title_temp.append(element.text)
    Title = ' '.join(Title_temp)
except:
    Title = ''
print(Title_temp)
driver.close()
Here is the output.
['Hertha BSC • Fenerbahçe S.K. • Bundesliga • Ante Čović • Berlin', 'Eintracht Frankfurt • UEFA Europa League • Tallinn • Estonia • Frankfurt', 'FC Augsburg • Galatasaray S.K. • Martin Schmidt • Bundesliga • Stefan Reuter', 'Austria national football team • FIFA • Austria • FIFA World Rankings', 'Lechia Gdańsk • Brøndby IF • 2019–20 UEFA Europa League • Gdańsk', 'Alexander Zverev • Hamburg', 'Julian Lenz • Association of Tennis Professionals • Alexander Zverev', 'UEFA Europa League • Diego • Nairo Quintana • Tour de France']

We were able to find a fix for this. We had to scrape the data from the inner HTML and then do a bit of data cleaning to get the required records.
import pandas as pd
import re
import time
from bs4 import BeautifulSoup
from bs4.element import Comment
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# html parser
def parse_html(content):
    soup = BeautifulSoup(content, 'html.parser')
    text_elements = soup.findAll(text=True)
    tag_blacklist = ['style', 'script', 'head', 'title', 'meta', '[document]', 'img']
    clean_text = []
    for element in text_elements:
        if element.parent.name in tag_blacklist or isinstance(element, Comment):
            continue
        clean_text.append(element.strip())
    result_text = " ".join(clean_text)
    result_text = re.sub(r'[\r\n]', '', result_text)
    tag_remove_pattern = re.compile(r'<[^>]+>')
    result_text = tag_remove_pattern.sub('', result_text)
    result_text = re.sub(r'\\', '', result_text)
    return result_text

seleniumDriver = r"./chromedriver.exe"
chrome_options = Options()
brow = webdriver.Chrome(executable_path=seleniumDriver, chrome_options=chrome_options)

links = ["https://trends.google.com/trends/trendingsearches/realtime?geo=DE&category=s"]
title_temp = []
for link in links:
    try:
        brow.get(link)
        try:
            elements = brow.find_elements_by_class_name('details-top')
            for element in elements:
                html_text = parse_html(element.get_attribute("innerHTML"))
                title_temp.append(html_text.replace('share', '').strip())
        except Exception as error:
            print(error)
        time.sleep(1)
        brow.quit()
    except Exception as error:
        print(error)
        break

Final_df = pd.DataFrame({'Title': title_temp})
print(Final_df)

Related

neither find_all nor find works

I am trying to scrape the names of all the favorites on the profile page of a user of our choice, but with this code I get the error "ResultSet object has no attribute 'find_all'". If I try to use find instead, I get the opposite error and it asks me to use find_all. I'm a beginner and I don't know what to do. (To test the code you can use the username "Kineta"; she's an administrator, so anyone can access her profile page.)
Thanks for your help.
from bs4 import BeautifulSoup
import requests
usr_name = str(input('the user you are searching for '))
html_text = requests.get('https://myanimelist.net/profile/'+usr_name)
soup = BeautifulSoup(html_text.text, 'lxml')
favs = soup.find_all('div', class_='fav-slide-outer')
favs_title = favs.find_all('span', class_='title fs10')
print(favs_title)
Your program throws an exception because you are trying to call .find_all on a ResultSet (favs_title = favs.find_all(...); a ResultSet doesn't have a .find_all method). Instead, you can use a CSS selector and select all the required elements directly:
import requests
from bs4 import BeautifulSoup

url = "https://myanimelist.net/profile/Kineta"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

for t in soup.select(".fav-slide .title"):
    print(t.text)
Prints:
Kono Oto Tomare!
Yuukoku no Moriarty
Kaze ga Tsuyoku Fuiteiru
ACCA: 13-ku Kansatsu-ka
Fukigen na Mononokean
Kakuriyo no Yadomeshi
Shirokuma Cafe
Fruits Basket
Akatsuki no Yona
Colette wa Shinu Koto ni Shita
Okobore Hime to Entaku no Kishi
Meteor Methuselah
Inu x Boku SS
Vampire Juujikai
Mirako, Yuuta
Forger, Loid
Osaki, Kaname
Miyazumi, Tatsuru
Takaoka, Tetsuki
Okamoto, Souma
Shirota, Tsukasa
Archiviste, Noé
Fang, Li Ren
Fukuroi, Michiru
Sakurayashiki, Kaoru
James Moriarty, Albert
Souma, Kyou
Hades
Yona
Son, Hak
Mashima, Taichi
Ootomo, Jin
Collabel, Yuca
Masuda, Toshiki
Furukawa, Makoto
Satou, Takuya
Midorikawa, Hikaru
Miki, Shinichiro
Hino, Satoshi
Hosoya, Yoshimasa
Kimura, Ryouhei
Ono, Daisuke
KENN
Yoshino, Hiroyuki
Toriumi, Kousuke
Toyonaga, Toshiyuki
Ooishi, Masayoshi
Shirodaira, Kyou
Hakusensha
EDIT: To get Anime/Manga/Character favorites:
import requests
from bs4 import BeautifulSoup
url = "https://myanimelist.net/profile/Kineta"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
anime_favorites = [t.text for t in soup.select("#anime_favorites .title")]
manga_favorites = [t.text for t in soup.select("#manga_favorites .title")]
char_favorites = [t.text for t in soup.select("#character_favorites .title")]
print("Anime Favorites")
print("-" * 80)
print(*anime_favorites, sep="\n")
print()
print("Manga Favorites")
print("-" * 80)
print(*manga_favorites, sep="\n")
print()
print("Character Favorites")
print("-" * 80)
print(*char_favorites, sep="\n")
Prints:
Anime Favorites
--------------------------------------------------------------------------------
Kono Oto Tomare!
Yuukoku no Moriarty
Kaze ga Tsuyoku Fuiteiru
ACCA: 13-ku Kansatsu-ka
Fukigen na Mononokean
Kakuriyo no Yadomeshi
Shirokuma Cafe
Manga Favorites
--------------------------------------------------------------------------------
Fruits Basket
Akatsuki no Yona
Colette wa Shinu Koto ni Shita
Okobore Hime to Entaku no Kishi
Meteor Methuselah
Inu x Boku SS
Vampire Juujikai
Character Favorites
--------------------------------------------------------------------------------
Mirako, Yuuta
Forger, Loid
Osaki, Kaname
Miyazumi, Tatsuru
Takaoka, Tetsuki
Okamoto, Souma
Shirota, Tsukasa
Archiviste, Noé
Fang, Li Ren
Fukuroi, Michiru
Sakurayashiki, Kaoru
James Moriarty, Albert
Souma, Kyou
Hades
Yona
Son, Hak
Mashima, Taichi
Ootomo, Jin
Collabel, Yuca
find and find_all do work, you just need to use them correctly. You can't call them on a ResultSet (like the favs variable in your example). You can, however, iterate through the ResultSet with a for loop and call find or find_all on each element.
I preferred to keep it a bit simpler, but you can choose whichever way you prefer, as I am not sure mine is more efficient:
from bs4 import BeautifulSoup
import requests

usr_name = str(input('the user you are searching for '))
html_text = requests.get('https://myanimelist.net/profile/' + usr_name)
soup = BeautifulSoup(html_text.text, 'lxml')

favs = soup.find_all('div', class_='fav-slide-outer')
for fav in favs:
    tag = fav.span
    print(tag.text)
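For completeness, here is a minimal sketch of the iterate-and-find_all route described above, reusing the class names from the question (they may need adjusting if the page markup changes):
from bs4 import BeautifulSoup
import requests

html_text = requests.get('https://myanimelist.net/profile/Kineta')
soup = BeautifulSoup(html_text.text, 'lxml')

favs = soup.find_all('div', class_='fav-slide-outer')
for fav in favs:
    # find_all works here because fav is a single Tag, not a ResultSet
    for title in fav.find_all('span', class_='title fs10'):
        print(title.text)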
If you need more info on how to use the bs4 functions correctly, I suggest looking through their docs here.
I looked at the page a bit and changed the code slightly; this way you should get all the results you need:
from bs4 import BeautifulSoup
import requests

usr_name = str(input('the user you are searching for '))
html_text = requests.get('https://myanimelist.net/profile/' + usr_name)
soup = BeautifulSoup(html_text.text, 'lxml')

favs = soup.find_all('li', class_='btn-fav')
for fav in favs:
    tag = fav.span
    print(tag.text)
I think the problem here is not really the code but how you searched your results and how the site is structured.

BeautifulSoup 4 - Web scraping soccer matches for 'today'

I'm very new to Python and trying to web-scrape soccer matches for 'today' from the Fox Sports website: https://www.foxsports.com/scores/soccer. Unfortunately, I keep running into issues with
'AttributeError: 'NoneType' object has no attribute 'find_all''
and can't seem to get the teams for that day. This is what I have so far:
import bs4
import requests
res = requests.get('foxsports.com/scores/soccer')
soup = bs4.BeautifulSoup(res.text, 'html.parser')
results = soup.find("div", class_="scores-date")
games = results.find("div", class_="scores")
print(games)
What happens?
The content is not static; it is served dynamically by the website, so requests won't get the information you can see in your dev tools.
How to fix?
Use an API if one is provided, or Selenium, which handles content like a browser and can provide the page_source you are looking for.
Because not all of the content is provided directly, you have to use Selenium waits to locate the presence of the <span> with class "title-text".
Example
Note: the example uses Selenium 4, so check your version and update, or adapt the required dependencies to a lower version yourself.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

service = ChromeService(executable_path='ENTER YOUR PATH TO CHROMEDRIVER')
driver = webdriver.Chrome(service=service)
driver.get('https://www.foxsports.com/scores/soccer')
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//span[contains(@class, "title-text") and text() = "Today"]')))

soup = BeautifulSoup(driver.page_source, 'lxml')

for g in soup.select('.scores-date:not(:has(div)) + div .score-chip-content'):
    print(list(g.stripped_strings))
Output
['SERIE A', 'JUVENTUS', '9-4-5', 'JUV', '9-4-5', 'CAGLIARI', '1-7-10', 'CAG', '1-7-10', '8:45PM', 'Paramount+', 'JUV -455', 'CAG +1100']
['LG CUP', 'ARSENAL', '0-0-0', 'ARS', '0-0-0', 'SUNDERLAND', '0-0-0', 'SUN', '0-0-0', '8:45PM', 'ARS -454', 'SUN +1243']
['LA LIGA', 'SEVILLA', '11-4-2', 'SEV', '11-4-2', 'BARCELONA', '7-6-4', 'BAR', '7-6-4', '9:30PM', 'ESPN+', 'SEV +155', 'BAR +180']
You have to provide a link with the http protocol. This code works:
import bs4
import requests
res = requests.get('https://foxsports.com/scores/soccer')
soup = bs4.BeautifulSoup(res.text, 'html.parser')
results = soup.find("div", class_="scores-date")
games = results.find("div", class_="scores")
print(results)
print(games)
However, games is None because bs4 cannot find any div with class "scores" in results.
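A minimal way to surface that failure instead of crashing is to guard against None; if the div isn't in the static HTML, the content is almost certainly rendered by JavaScript and you need Selenium or the API instead.
import bs4
import requests

res = requests.get('https://foxsports.com/scores/soccer')
soup = bs4.BeautifulSoup(res.text, 'html.parser')

results = soup.find("div", class_="scores-date")
games = results.find("div", class_="scores") if results else None

if games is None:
    print("No 'scores' div in the static HTML; the page is likely rendered by JavaScript")
else:
    print(games)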
It's far more efficient if you go through the API. All the data is there, including much more (but I only pulled out the scores to print). You'll first have to access the site to grab the apikey to be used as a parameter.
I've also added the option to choose your group/league, so you'll need to pip install choice.
import requests
import datetime
from bs4 import BeautifulSoup
import re
# pip install choice
import choice

# Get the apikey
url = 'https://www.foxsports.com/scores/soccer'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
apikey = soup.find_all('div', {'data-scoreboard': re.compile("^https://")})[0]['data-scoreboard'].split('apikey=')[-1]

# Get the group ids and corresponding titles
url = 'https://api.foxsports.com/bifrost/v1/soccer/scoreboard/main'
payload = {'apikey': apikey}
jsonData = requests.get(url, params=payload).json()
groupsTitle_list = [x['title'] for x in jsonData['groupList']]
groupsId_list = [x['id'] for x in jsonData['groupList']]
groups_dict = dict(zip(groupsTitle_list, groupsId_list))
user_input = choice.Menu(groups_dict.keys()).ask()
groupId = groups_dict[user_input]

# Get the date of the score you are after
date_param = input('Enter date in YYYYMMDD format\nEx: 20220109\n-> ')
# If you prefer to always just grab today's score, use the line below
#date_param = datetime.datetime.now().strftime("%Y%m%d")

# Pull the score for the date and group
url = f'https://api.foxsports.com/bifrost/v1/soccer/scoreboard/segment/c{groupId}d{date_param}'
payload = {
    'apikey': apikey,
    'groupId': groupId}
jsonData = requests.get(url, params=payload).json()

if len(jsonData['sectionList']) == 0:
    print(f'No score available on {date_param} for {user_input}')
else:
    returnDate = jsonData['sectionList'][0]['menuTitle']
    print(f'\n {returnDate} - {user_input}')
    events = jsonData['sectionList'][0]['events']
    for event in events:
        lowerTeamName = event['lowerTeam']['longName']
        lowerTeamScore = event['lowerTeam']['score']
        upperTeamName = event['upperTeam']['longName']
        upperTeamScore = event['upperTeam']['score']
        print(f'\t{upperTeamName} {upperTeamScore}')
        print(f'\t{lowerTeamName} {lowerTeamScore}\n')
Output:
Make a choice:
0: FEATURED MATCHES
1: ENGLISH PREMIER LEAGUE
2: MLS
3: LA LIGA
4: LIGUE 1
5: BUNDESLIGA
6: UEFA CHAMPIONS LEAGUE
7: LIGA MX
8: SERIE A
9: WCQ - CONCACAF
Enter number or name; return for next page
? 0
Enter date in YYYYMMDD format
Ex: 20220109
-> 20220109
SUN, JAN 9 - FEATURED MATCHES
LIVERPOOL 4
SHREWSBURY 1
TOTTENHAM 3
MORECAMBE 1
WOLVES 3
SHEFFIELD UTD 0
WEST HAM 2
LEEDS UNITED 0
NOTTINGHAM 1
ARSENAL 0
ROMA 3
JUVENTUS 4
LYON 1
PARIS SG 1
VILLARREAL 2
ATLÉTICO MADRID 2
GUADALAJARA 3
MAZATLÁN FC 0

Unable to make my script produce a particular output

I'm trying to create a script in Python to fetch all the links connected to the names of different actors from imdb.com, then parse their first three movie links, and finally scrape the names of the director and writer of those movies. There are around 1000 names there; I'm okay with the first three names for this example.
Website link
I can scrape the links of different actors and their first three movie links in one go.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.imdb.com/list/ls058011111/'
base = 'https://www.imdb.com/'

def get_actor_list(s):
    res = s.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "lxml")
    for name_links in soup.select(".mode-detail")[:3]:
        name = name_links.select_one("h3 > a").get_text(strip=True)
        item_link = urljoin(base, name_links.select_one("h3 > a").get("href"))
        yield from get_movie_links(s, name, item_link)

def get_movie_links(s, name, link):
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    item_links = [urljoin(base, item.get("href")) for item in soup.select(".filmo-category-section .filmo-row > b > a[href]")[:3]]
    yield name, item_links

if __name__ == '__main__':
    with requests.Session() as s:
        for elem in get_actor_list(s):
            print(elem)
The result I get:
('Robert De Niro', ['https://www.imdb.com/title/tt4075436/', 'https://www.imdb.com/title/tt3143812/', 'https://www.imdb.com/title/tt5537002/'])
('Jack Nicholson', ['https://www.imdb.com/title/tt1341188/', 'https://www.imdb.com/title/tt1356864/', 'https://www.imdb.com/title/tt0825232/'])
('Marlon Brando', ['https://www.imdb.com/title/tt10905860/', 'https://www.imdb.com/title/tt0442674/', 'https://www.imdb.com/title/tt1667880/'])
I can even parse the names of the directors and the writers of those linked movies if I individually use the links within the following function:
def get_content(s, url):
    res = s.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "lxml")
    director = soup.select_one("h4:contains('Director') ~ a")
    director = director.get_text(strip=True) if director else None
    writer = soup.select_one("h4:contains('Writer') ~ a").get_text(strip=True)
    print(director, writer)
However, I would like to restructure the script by merging those functions in such a way that it produces the following (final) output:
('Robert De Niro', [Jonathan Jakubowicz, Jonathan Jakubowicz, None, Anthony Thorne, Martin Scorsese, David Grann])
('Jack Nicholson', [James L. Brooks, James L. Brooks, Casey Affleck, Casey Affleck, Rob Reiner, Justin Zackham])
('Marlon Brando', [Bob Bendetson, Bob Bendetson, Peter Mitchell, Rubin Mario Puzo, Paul Hunter, Paul Hunter])
How can I get the final output by merging the above functions in the right way?
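One possible way to wire the functions together (a sketch only, reusing the selectors from the question; IMDb markup changes often, so treat them as assumptions): yield each actor's name with their first three movie links, then call get_content on every link and collect the director and writer names into one list per actor.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.imdb.com/list/ls058011111/'
base = 'https://www.imdb.com/'
headers = {"User-Agent": "Mozilla/5.0"}

def get_actor_list(s):
    soup = BeautifulSoup(s.get(url, headers=headers).text, "lxml")
    # first three actors only, as in the question
    for name_link in soup.select(".mode-detail h3 > a")[:3]:
        name = name_link.get_text(strip=True)
        item_link = urljoin(base, name_link.get("href"))
        yield name, get_movie_links(s, item_link)

def get_movie_links(s, link):
    soup = BeautifulSoup(s.get(link, headers=headers).text, "lxml")
    return [urljoin(base, a.get("href"))
            for a in soup.select(".filmo-category-section .filmo-row > b > a[href]")[:3]]

def get_content(s, movie_url):
    soup = BeautifulSoup(s.get(movie_url, headers=headers).text, "lxml")
    director = soup.select_one("h4:contains('Director') ~ a")
    writer = soup.select_one("h4:contains('Writer') ~ a")
    return (director.get_text(strip=True) if director else None,
            writer.get_text(strip=True) if writer else None)

if __name__ == '__main__':
    with requests.Session() as s:
        for name, movie_links in get_actor_list(s):
            people = []
            for movie_link in movie_links:
                director, writer = get_content(s, movie_link)
                people.extend([director, writer])
            print((name, people))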

Generating URL for Yahoo news and Bing news with Python and BeautifulSoup

I want to scrape data from the Yahoo News and Bing News pages. The data I want to scrape are the headlines and/or the text below the headlines (whatever can be scraped) and the dates (times) when they were posted.
I have written some code but it does not return anything. It's a problem with my URL, since I'm getting response 404.
Can you please help me with it?
This is the code for 'Bing'
from bs4 import BeautifulSoup
import requests
term = 'usa'
url = 'http://www.bing.com/news/q?s={}'.format(term)
response = requests.get(url)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)
And this is for Yahoo:
term = 'usa'
url = 'http://news.search.yahoo.com/q?s={}'.format(term)
response = requests.get(url)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)
Please help me generate these URLs and explain the logic behind them; I'm still a noob :)
Basically, your URLs are just wrong. The URLs you have to use are the same ones you find in the address bar while using a regular browser. Most search engines and aggregators use the q parameter for the search term. Most of the other parameters are usually not required (sometimes they are, e.g. for specifying the result page number).
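As a small illustration of that point, you can build such URLs the same way the browser does, passing the term through the q parameter; urllib.parse.urlencode handles the escaping of spaces and special characters.
from urllib.parse import urlencode

term = 'usa election'
bing_url = 'https://www.bing.com/news/search?' + urlencode({'q': term})
yahoo_url = 'https://news.search.yahoo.com/search?' + urlencode({'q': term})
print(bing_url)    # https://www.bing.com/news/search?q=usa+election
print(yahoo_url)   # https://news.search.yahoo.com/search?q=usa+election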
Bing
from bs4 import BeautifulSoup
import requests
import re

term = 'usa'
url = 'https://www.bing.com/news/search?q={}'.format(term)
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

for news_card in soup.find_all('div', class_="news-card-body"):
    title = news_card.find('a', class_="title").text
    time = news_card.find(
        'span',
        attrs={'aria-label': re.compile(".*ago$")}
    ).text
    print("{} ({})".format(title, time))
Output
Jason Mohammed blitzkrieg sinks USA (17h)
USA Swimming held not liable by California jury in sexual abuse case (1d)
United States 4-1 Canada: USA secure payback in Nations League (1d)
USA always plays the Dalai Lama card in dealing with China, says Chinese Professor (1d)
...
Yahoo
from bs4 import BeautifulSoup
import requests

term = 'usa'
url = 'https://news.search.yahoo.com/search?q={}'.format(term)
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

for news_item in soup.find_all('div', class_='NewsArticle'):
    title = news_item.find('h4').text
    time = news_item.find('span', class_='fc-2nd').text
    # Clean time text
    time = time.replace('·', '').strip()
    print("{} ({})".format(title, time))
Output
USA Baseball will return to Arizona for second Olympic qualifying chance (52 minutes ago)
Prized White Sox prospect Andrew Vaughn wraps up stint with USA Baseball (28 minutes ago)
Mexico defeats USA in extras for Olympic berth (13 hours ago)
...

Dynamic Web scraping

I am trying to scrape this page ("http://www.arohan.in/branch-locator.php"), in which, when I select the state and city, an address is displayed, and I have to write the state, city and address to a CSV/Excel file. I am able to reach this step, but now I am stuck.
Here is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait

chrome_path = r"C:\Users\IBM_ADMIN\Downloads\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("http://www.arohan.in/branch-locator.php")
select = Select(driver.find_element_by_name('state'))
select.select_by_visible_text('Bihar')
drop = Select(driver.find_element_by_name('branch'))
city_option = WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath("//select[@id='city1']/option[text()='Gaya']"))
city_option.click()
Is Selenium necessary? It looks like you can use URLs to arrive at what you want: http://www.arohan.in/branch-locator.php?state=Assam&branch=Mirza.
Get a list of the state/branch combinations, then use Beautiful Soup to get the info from each page.
In a slightly more organized manner:
import requests
from bs4 import BeautifulSoup

link = "http://www.arohan.in/branch-locator.php?"

def get_links(session, url, payload):
    session.headers["User-Agent"] = "Mozilla/5.0"
    res = session.get(url, params=payload)
    soup = BeautifulSoup(res.text, "lxml")
    item = [item.text for item in soup.select(".address_area p")]
    print(item)

if __name__ == '__main__':
    for st, br in zip(['Bihar', 'West Bengal'], ['Gaya', 'Kolkata']):
        payload = {
            'state': st,
            'branch': br
        }
        with requests.Session() as session:
            get_links(session, link, payload)
Output:
['Branch', 'House no -10/12, Ward-18, Holding No-12, Swarajpuri Road, Near Bank of Baroda, Gaya Pin 823001(Bihar)', 'N/A', 'N/A']
['Head Office', 'PTI Building, 4th Floor, DP Block, DP-9, Salt Lake City Calcutta, 700091', '+91 33 40156000', 'contact@arohan.in']
A better approach would be to avoid using Selenium. Selenium is useful if you require JavaScript processing to render the HTML; in your case this is not needed, as the required information is already contained within the HTML.
What is needed is to first make a request to get a page containing all of the states. Then, for each state, request the list of branches. Then, for each state/branch combination, a URL request can be made to get the HTML containing the address. This happens to be contained in the second <li> entry following a <ul class='address_area'> entry:
from bs4 import BeautifulSoup
import requests
import csv
import time

# Get a list of available states
r = requests.get('http://www.arohan.in/branch-locator.php')
soup = BeautifulSoup(r.text, 'html.parser')
state_select = soup.find('select', id='state1')
states = [option.text for option in state_select.find_all('option')[1:]]

# Open an output CSV file
with open('branch addresses.csv', 'w', newline='', encoding='utf-8') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['State', 'Branch', 'Address'])

    # For each state determine the available branches
    for state in states:
        r_branches = requests.post('http://www.arohan.in/Ajax/ajax_branch.php', data={'ajax_state': state})
        soup = BeautifulSoup(r_branches.text, 'html.parser')

        # For each branch, request a page containing the address
        for option in soup.find_all('option')[1:]:
            time.sleep(0.5)  # Reduce server loading
            branch = option.text
            print("{}, {}".format(state, branch))
            r_branch = requests.get('http://www.arohan.in/branch-locator.php', params={'state': state, 'branch': branch})
            soup_branch = BeautifulSoup(r_branch.text, 'html.parser')
            ul = soup_branch.find('ul', class_='address_area')

            if ul:
                address = ul.find_all('li')[1].get_text(strip=True)
                row = [state, branch, address]
                csv_output.writerow(row)
            else:
                print(soup_branch.title)
Giving you an output CSV file starting:
State,Branch,Address
West Bengal,Kolkata,"PTI Building, 4th Floor,DP Block, DP-9, Salt Lake CityCalcutta, 700091"
West Bengal,Maheshtala,"Narmada Park, Par Bangla,Baddir Bandh Bus Stop,Opp Lane Kismat Nungi Road,Maheshtala,Kolkata- 700140. (W.B)"
West Bengal,ShyamBazar,"First Floor, 6 F.b.T. Road,Ward No.-6,Kolkata-700002"
You should slow the script down using a time.sleep(0.5) to avoid too much loading on the server.
Note: [1:] is used because the first item in the drop-down lists is not a branch or state, but a Select Branch entry.
