How to extract data from dropdown list using python - python

I am trying to extract the data from drop down using python.
link:
From that,
State
Category
District
CTSO
Division
Map Type
mentions so, I want to extract data from all by selecting one by one using beautifulsoup or request library.
In my code, I gave district name statically but I want to select it one by one and extract data from it.
I tried but it doesn't work
`
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
list = ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]
url = "http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName="
for lists in list:
urls= url+lists
# print(urls)
response = requests.get(urls)
# print(response)
soup = BeautifulSoup(response.text, "html.parser")
# print(soup)
# soups= soup.find_all("div", {"id": "level_text_2"})
# print(soups)
# for ids in soup.find_all(attrs={'id': 'location_table'}):
# print(ids)
# ids = ids.text.strip()
# print(ids)
for option in soup.find_all('option'):
print(option.text)
for tag in soup.find_all(class_="panel-body"):
# print(tag.get('ctl00_ContentPlaceHolder5_ddlDistrict'))
print(tag)
`
I want:
District name
All taluka names
and all villages name. like that

Something like this
import requests
from bs4 import BeautifulSoup
for dist_name in ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]:
print('-- {} --'.format(dist_name))
r = requests.get('http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName={}'.format(dist_name))
if r.status_code == 200:
soup = BeautifulSoup(r.text, "html.parser")
select_list = soup.find_all('select')
for select in select_list:
print('Select name: ' + select.attrs['name'])
option_list = select.find_all('option')
for option in option_list:
print('\t option: ' + option.attrs['value'])

Related

Empty Dataframe when scraping specific column from website

I wanted to try to scrape some specific columns (Company details column) in the CNBC Nasdaq 100 website specifically the Adobe stocks, below is the snippet of my code
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
def get_company_info(url):
original_url = url
key = {}
l = []
page_response = requests.get(url, timeout=240)
page_content = BeautifulSoup(page_response.content, "html.parser")
name = page_content.find('div',{"class":"quote-section-header large-header"}).find("span",{"class":"symbol"}).text
description = page_content.find_all('div',{"class":"moduleBox"})
for items in description:
for i in range(len(items.find_all("tr"))-1):
# Gather data
key["stock_desc"] = items.find_all("td", {"class":"desc"})[i].find('div',attrs={'id':'descLong'}).text
shares = items.find_all("td").find("table",attrs={"id":"shares"})
for rest_of_items in shares:
for i in range(len(items.find_all("tr"))-1):
key["stock_outstanding-shares"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_ownership"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_market_cap"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_lastSplit"] = items.find_all("td", {"class":"bold aRit"})[i].text
# Print ("")
l.append(key)
key['name'] = name
df = pd.DataFrame(l)
print(df)
return key, df
get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So, I'm keen to get the result in dataframe so that I can change to CSV file, but my code keep showing empty dataframe result, Below are the error shown
The result I wanted is something like this
The information you are looking for is not available in the url you requested. This is because the information is fetched by the page using a JavaScript. Which in turn requests a different URL which provides the data.
Example code
from bs4 import BeautifulSoup
import requests
page=requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')
Name=soup.find("h5",id="companyName").text
stock_desc= soup.find("div",id="descLong").text
table=soup.find("table",id="shares")
details=table.find_all("td", class_="bold aRit")
stock_outstanding_shares= details[0].text
stock_ownership= details[1].text
stock_market_cap= details[2].text
stock_lastSplit= details[3].text
You can create dataframe and export to csv.

How to get complete href links using beautifulsoup in python

I am trying to get top movies name by genre. I couldn't get complete href links for that, I stuck by getting half href links
By the following code I got,
https://www.imdb.com/search/title?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,
https://www.imdb.com/search/title?genres=adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,
https://www.imdb.com/search/title?genres=animation&sort=user_rating,desc&title_type=feature&num_votes=25000,
https://www.imdb.com/search/title?genres=biography&sort=user_rating,desc&title_type=feature&num_votes=25000,
.........
Like that but i want to all top 100 movies name by its genre like action, Adventure, Animation, Biography.......
I tried the following code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.imdb.com'
main_url = url + '/chart/top'
res = requests.get(main_url)
soup = BeautifulSoup(res.text, 'html.parser')
for href in soup.find_all(class_='subnav_item_main'):
# print(href)
all_links = url + href.find('a').get('href')
print(all_links)
I want complete link as shown bellow from a link
/search/title?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=FM1ZEBQ7E9KGQSDD441H&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_1"
You need another loop over those urls and a limit to only get 100. I store in a dictionary with keys being genre and values being a list of films. Note original titles may appear e.g. The Mountain II (2016) is Dag II (original title).
links is a list of tuples where I keep the genre as first item and url as second.
import requests, pprint
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
url = 'https://www.imdb.com/chart/top'
genres = {}
with requests.Session() as s:
r = s.get(url)
soup = bs(r.content, 'lxml')
links = [(i.text, urljoin(url,i['href'])) for i in soup.select('.subnav_item_main a')]
for link in links:
r = s.get(link[1])
soup = bs(r.content, 'lxml')
genres[link[0].strip()] = [i['alt'] for i in soup.select('.loadlate', limit = 100)]
pprint.pprint(genres)
Sample output:

Unable to print once to get all the data altogether

I've written a script in python to scrape the tablular content from a webpage. In the first column of the main table there are the names. Some names have links to lead another page, some are just the names without any link. My intention is to parse the rows when a name has no link to another page. However, when the name has link to another page then the script will first parse the concerning rows from the main table and then follow that link to parse associated information of that name from the table located at the bottom under the title Companies. Finally, write them in a csv file.
site link
I've tried so far:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
link = "https://suite.endole.co.uk/insight/company/ajax_people.php?ajax_url=ajax_people&page=1&company_number=03512889"
base = "https://suite.endole.co.uk"
res = requests.get(link)
soup = BeautifulSoup(res.text,"lxml")
for item in soup.select("table tr")[1:]:
if not item.select_one("td a[href]"):
first_table = [i.text for i in item.select("td")]
print(first_table)
else:
first_table = [i.text for i in item.select("td")]
print(first_table)
url = urljoin(base,item.select_one("td a[href]").get("href"))
resp = requests.get(url)
soup_ano = BeautifulSoup(resp.text,"lxml")
for elems in soup_ano.select(".content:contains(Companies) table tr")[1:]:
associated_info = [elem.text for elem in elems.select("td")]
print(associated_info)
My above script can do almost everything but I can't create any logic to print once rather than printing thrice to get all the data atltogether so that I can write them in a csv file.
Put all your scraped data into a list, here I've called the list associated_info then all the data is in one place & you can iterate over the list to print it out to a CSV if you like...
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
link = "https://suite.endole.co.uk/insight/company/ajax_people.php?ajax_url=ajax_people&page=1&company_number=03512889"
base = "https://suite.endole.co.uk"
res = requests.get(link)
soup = BeautifulSoup(res.text,"lxml")
associated_info = []
for item in soup.select("table tr")[1:]:
if not item.select_one("td a[href]"):
associated_info.append([i.text for i in item.select("td")])
else:
associated_info.append([i.text for i in item.select("td")])
url = urljoin(base,item.select_one("td a[href]").get("href"))
resp = requests.get(url)
soup_ano = BeautifulSoup(resp.text,"lxml")
for elems in soup_ano.select(".content:contains(Companies) table tr")[1:]:
associated_info.append([elem.text for elem in elems.select("td")])
print(associated_info)

How do I simultaneously scrape two pages and produce two distinct lists within one nested 'for-loop'?

I'm scraping from two URLs that have the same DOM structure, and so I'm trying to find a way to scrape both of them at the same time.
The only caveat is that the data scraped from both these pages need to end up on distinctly named lists.
To explain with example, here is what I've tried:
import os
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]
ws_list = []
ws48_list = []
categories = [ws_list, ws48_list]
for url in urls:
response = requests.get(url, headers=headers)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
for a in section.find_all('a'):
player_name = a.text
for cat_list in categories:
cat_list.append(player_name)
print(ws48_list)
print(ws_list)
This ends up printing two identical lists when I was shooting for 2 lists unique to its page.
How do I accomplish this? Would it be better practice to code it another way?
Instead of trying to append to already existing lists. Just create new ones. Make a function to do the scrape and pass each url in turn to it.
import os
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]
def parse_page(url, headers={}):
response = requests.get(url, headers=headers)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
return [a.text for a in section.find_all('a')]
ws_list, ws48_list = [parse_page(url) for url in urls]
print('ws_list = %r' % ws_list)
print('ws8_list = %r' % ws48_list)
Just add them to the appropriate list and the problem is solved?
for i, url in enumerate(urls):
response = requests.get(url)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
for a in section.find_all('a'):
player_name = a.text
categories[i].append(player_name)
print(ws48_list)
print(ws_list)
You can use a function to define your scraping logic, then just call it for your urls.
import os
import requests
from bs4 import BeautifulSoup as bs
def scrape(url):
response = requests.get(url)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
names = []
for a in section.find_all('a'):
player_name = a.text
names.append(player_name)
return names
ws_list = scrape('https://www.basketball-reference.com/leaders/ws_career.html')
ws48_list = scrape('https://www.basketball-reference.com/leaders/ws_per_48_career.html')
print(ws_list)
print(ws48_list)

Python /bs4: trying to print temperature/city from a local website

I'm trying to get and print the current weather temperature and city name from a local website, but no success.
All I need it to read and print the city (Lodrina), the Temperature (23.1C) and if possible the title in ca-cond-firs ("Temperatura em declínio") - this last one changes as temps goes up or down...
This is the html section of the site:
THIS IS THE HTML (the part of matters:)
#<div class="ca-cidade">Londrina</div>
<ul class="ca-condicoes">
<li class="ca-cond-firs"><img src="/site/imagens/icones_condicoes/temperatura/temp_baixa.png" title="Temperatura em declínio"/><br/>23.1°C</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/vento/L.png"/><br/>10 km/h</li>
<li class="ca-cond"><div class="ur">UR</div><br/>54%</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/chuva.png"/><br/>0.0 mm</li>
THIS IS THE CODE I DID SO FAR:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
id = soup.find('a', 'id=23185109')
print(id)
any help?
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser') # parse page as html
temp_table = soup.find_all('table', {'class':'cidadeTempo'}) # get detail of table with class name cidadeTempo
for entity in temp_table:
city_name = entity.find('h3').text # fetches name of city
city_temp_max = entity.find('span', {'class':'tempMax'}).text # fetches max temperature
city_temp_min = entity.find('span', {'class':'tempMin'}).text # fetches min temperature
print("City :{} \t Max_temp: {} \t Min_temp: {}".format(city_name, city_temp_max, city_temp_min)) # prints content
below code can get details of temprature at right side of page as you require.
result_table = soup.find('div', {'class':'ca-content-wrapper'})
print(result_table.text) # in your case there is no other div exist with class name ca-content-wrapper hence I can use it directly without iterating. you can use if condition to control which city temprature to print and which to not.
# output will be like :
# Apucarana
# 21.5°C
# 4 km/h
# UR60%
# 0.0 mm
I'm not sure what problems you are running into with your code. In my attempts to use your code, I found that I needed to use the html parser to successfully parse the website. I also used soup.findAll() in order to find elements that matched the desired class. Hopefully the below will lead you to your answer:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
rows = soup.findAll('li', {'class', 'ca-cond-firs'})
print rows
You should try out the CSS3 selectors in BS4, I personally find it a lot easier to use than find and find_all.
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
# soup.select returns the list of all the elements that matches the CSS3 selector
# get the text inside each <a> tag inside div.ca-cidade
cities = [cityTag.text for cityTag in soup.select("div.ca-cidade > a")]
# get the temperature inside each li.ca-cond-firs
temps = [tempTag.text for tempTag in soup.select("li.ca-cond-firs")]
# get the temperature status inside each li.ca-cond-firs > img title attibute
tempStatus = [tag["title"] for tag in soup.select("li.ca-cond-firs > img")]
# len(cities) == len(temps) == len(tempStatus) => This is normally true.
for i in range(len(cities)):
print("City: {}, Temperature: {}, Status: {}.".format(cities[i], temps[i], tempStatus[i]))
Here you go. You can customize that wind thing depending on icon name.
#!/usr/bin/env python
# -*- encoding: utf8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from bs4 import BeautifulSoup
import requests
def get_weather_data():
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
cities = soup.find('div', {"class":"ca-content-wrapper"})
weather_data = []
for city in cities.findAll("div", {"class":"ca-bg"}):
name = city.find("div", {"class":"ca-cidade"}).text
temp = city.find("li", {"class":"ca-cond-firs"}).text
conditons = city.findAll("li", {"class":"ca-cond"})
weather_data.append({
"city":name,
"temp":temp,
"conditions":[{
"wind":conditons[0].text +" "+what_wind(conditons[0].find("img")["src"]),
"humidity":conditons[1].text,
"raind":conditons[2].text,
}]
})
return weather_data
def what_wind(img):
if img.find ("NE"):
return "From North East"
if img.find ("O"):
return "From West"
if img.find ("N"):
return "From North"
#you can add other icons here
print get_weather_data()
And that is all weather data from that website.

Categories

Resources