Web Scraping tables and data with Python Beautifulsoup - python

I have scraped the data from this table, using Python-Beautifulsoup, from all the pages for this website and into a dictionary, as seen from the code below.
However, I am also trying to scrape for each company which has it’s own separate page,into that dictionary also.
import requests
from bs4 import BeautifulSoup
from pprint import pprint
company_data = []
for i in range(1, 3):
page = requests.get(f'https://web.archive.org/web/20121007172955/http://www.nga.gov/collection/anZ1.htm{i}?')
soup = BeautifulSoup(page.text, "lxml")
row_info = soup.select('div.accordion_heading.panel-group.s_list_table')
for row_info in row_info:
comapny_info = {}
comapny_info['Name'] = row_info.select_one('div.col_1 a').text.strip()
pprint(company_data)

I have just done with only for 2M company
I believe that helps.
import requests
from bs4 import BeautifulSoup
res=requests.get("https://web.archive.org/web/20121007172955/http://www.nga.gov/collection/anZ1.htm").text
soup=BeautifulSoup(res,'html.parser')
comapny_info={}
comapny_info['Profile'] = soup.select('div.text-desc-members')
if len(soup.select('div.text-desc-members'))==0:
comapny_info['Profile'] = soup.select('div.list-sub')[0].text.strip()
comapny_info['ACOP']=[item['href'] for item in soup.select(".table.table-striped a.files")]
comapny_info['QuestionAnswer']=["Question:" + q.text.strip() + " Answer:" +a.text.strip() for q ,a in zip(soup.select("div.list-reports .m_question"),soup.select("div.list-reports .m_answer")) ]
print(comapny_info)

Related

How to not print empty line?

I'm trying to scrap some links from a site but I'm running into an issue where my for loop will stop at the first link.
Currently What I have
import requests
import lxml
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
fighter_links = soup.find('td', {
'class': 'b-statistics__table-col'
}).find_all('a')
fighterLinks = []
for anchor in fighter_links:
# urls = anchor['href']
fighterLinks.append(anchor['href'])
print(fighterLinks)
When I print I'm getting
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9']
Site I'm trying to pull from
when you do
fighter_links = soup.find('td', {'class': 'b-statistics__table-col'}).find_all('a')
you are only getting the first table record. soup.find will only return the first match that it finds. what you need to do is change it to
fighter_links = soup.find_all('td', {'class': 'b-statistics__table-col'})
fighterLinks = []
that will get you all the table enteries that match your class name, and from there you need to do loop to extract out the links
for link in fighter_links:
if(link.find('a')):
fighterLinks.append(link.find('a').get('href'))
I don't know if this will help, but I hope it does:
import requests
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
aa = soup.select("a.b-link_style_black")
fighterLinks = []
for i in aa:
for k in i:
fighterLinks.append(aa[aa.index(i)].attrs["href"])
print(fighterLinks)
outputs:
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a']
Requests will fail on some connections in this instance. Better use cloudscraper: (pip install cloudscraper)
import cloudscraper
from bs4 import BeautifulSoup
scraper = cloudscraper.create_scraper()
soup = BeautifulSoup(scraper.get("http://ufcstats.com/statistics/fighters?char=a").text)
links = soup.select_one('.b-statistics__table').select('a')
print(set([x.get('href') for x in links]))
This returns:
{'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/1c5879330d42255f'}

Python html parsing using beautiful soup issues

I am trying to get the name of all organizations from https://www.devex.com/organizations/search using beautifulsoup.However, I am getting an error. Can someone please help.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
headers = {"Accept-Language": "en-US,en;q=0.5"}
titles = []
pages = np.arange(1, 2, 1)
for page in pages:
page = requests.get("https://www.devex.com/organizations/search?page%5Bnumber%5D=" + str(page) + "", headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
movie_div = soup.find_all('div', class_='info-container')
sleep(randint(2,10))
for container in movie_div:
name = container.a.find('h3', class_= 'ng-binding').text
titles.append(name)
movies = pd.DataFrame({
'movie': titles,
})
to see your dataframe
print(movies)
to see the datatypes of your columns
print(movies.dtypes)
to see where you're missing data and how much data is missing
print(movies.isnull().sum())
to move all your scraped data to a CSV file
movies.to_csv('movies.csv')
you may try with something like
name = bs.find("h3", {"class": "ng-binding"})

Web Scraping a table with BeautifulSoup

I am trying to fetch the calendar of events of current month and store it into the file in the same format, such as, date, time, event and location) from my school's website https://www.umkc.edu/calendar/
I don't have much experience in HTML so I don't really know what I am dealing with, that's why I used iframe, but I don't even know if it's the right thing.
Can you help me with getting the table that has the events?
That's what I have so far:
import requests
from bs4 import BeautifulSoup
html = requests.get("https://www.umkc.edu/calendar/")
soup = BeautifulSoup(html.content, "html.parser")
print(soup.find_all("iframe"))
So, data is dynamically added. You could use the same request as is performed for iframe content and use regex (or split) to get out the required javascript to then parse with hjson (due to unquoted keys); extract the html content and parse with bs4. The alternative, shown at bottom is to investigate the rss feed.
from bs4 import BeautifulSoup as bs
import requests, re, hjson
import pandas as pd
r = requests.get('https://www.trumba.com/s.aspx?calendar=UMKC&widget=main&srpc.cbid=trumba.spud.4&srpc.get=true')
p = re.compile(r'requestComplete\((.*)\);', re.DOTALL)
data = hjson.loads(re.sub('\r+|\n+|\t+','', p.findall(r.text)[0].strip()))
soup = bs(data['body'],'lxml')
p2 = re.compile(r"'(.*?)'", re.DOTALL)
results = []
for tr in soup.select('.twSimpleTableTable tr')[2:]:
date = tr.select_one('.twStartDate').text
time = tr.select_one('td:has(.twStartDate) + td, .twStartTime').text
event = p2.findall(tr.select_one('.txVMid')['title'])[0].strip()
location = tr.select_one('.twLocation')
if location is None:
location = ''
else:
location = ' '.join([string for string in location.stripped_strings])
row = [date, time, event, location]
results.append(row)
df = pd.DataFrame(results)
print(df)
Or (this needs further investigation and work):
import requests, re
import pandas as pd
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.trumba.com/calendars/UMKC.rss')
soup = bs(r.content, 'lxml')
df = pd.DataFrame(zip([i.text for i in soup.select('title')]
,[i.text for i in soup.select('link')]
,[re.sub('<br />|<br/>','',i.text) for i in soup.select('description')]))
print(df)

Issues writing data scraper

I have to make a code in order to scrape datafrom a website and then analyse them for university.
My problem is that I made this code in order to get some data for all products but when I run it it only shows a single response for each variable.
Can you help me resolve this error ?
from bs4 import BeautifulSoup as soup
import urllib
from urllib.request import urlopen as uReq
import requests
myurl='https://boutique.orange.fr/mobile/choisir-un-mobile'
Uclient=uReq(myurl)
page=Uclient.read()
Uclient.close()
pagesoup=soup(page,'html.parser')
containers=pagesoup.findAll('div',{'class':'box-prod pointer'})
container=containers[0]
produit=container.img['alt']
price=container.findAll('span',{'class':'price'})
price2=container.findAll('div',{'class':'prix-seul'})
avis=container.footer.div.a.img['alt']
file="orange.csv"
f=open(file,'w')
headers='produit,prix avec abonnement, prix seul, avis\n'
f.write(headers)
for container in containers:
produit=container.img['alt']
price=container.findAll('span',{'class':'price'})
price2=container.findAll('div',{'class':'prix-seul'})
avis=container.footer.div.a.img['alt']
You could use different selectors. Separate two prices per product by index. Extract price specific info using join and findall.
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://boutique.orange.fr/mobile/choisir-un-mobile'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
#print(len(soup.select('#resultat .box-prod.pointer')))
p = re.compile('[0-9,€]+')
altText= [item.get('alt').strip() for item in soup.select('#resultat .box-prod.pointer .lazy')]
titles = [item.text.strip().replace('\n', ' ') for item in soup.select('#resultat .box-prod.pointer .titre-produit')]
allPrices = [''.join(p.findall(item.text)) for item in soup.select('#resultat span.price')]
aPartirPrice = allPrices[0::2]
prixSeul = allPrices[1::2]
items = list(zip(titles, altText, aPartirPrice, prixSeul))
df = pd.DataFrame(items,columns=['title', 'altText', 'aPartirPrice', 'prixSeul'])
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8',index = False )
Transpose with:
df = df.T

Scrape Multiple URLs using Beautiful Soup

I'm trying to extract specific classes from multiple URLs. The tags and classes stay the same but I need my python program to scrape all as I just input my link.
Here's a sample of my work:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
url = input('insert URL here: ')
#scrape elements
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_= "class-headline")
print(h1.get_text())
This works for individual URLs but not for a batch. Thanks for helping me. I learned a lot from this community.
Have a list of urls and iterate through it.
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
urls = ['www.website1.com', 'www.website2.com', 'www.website3.com', .....]
#scrape elements
for url in urls:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_= "class-headline")
print(h1.get_text())
If you are going to prompt user for input for each site then it can be done this way
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
urls = ['www.website1.com', 'www.website2.com', 'www.website3.com', .....]
#scrape elements
msg = 'Enter Url, to exit type q and hit enter.'
url = input(msg)
while(url!='q'):
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_= "class-headline")
print(h1.get_text())
input(msg)
If you want to scrape links in batches. Specify a batch size and iterate over it.
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
batch_size = 5
urllist = ["url1", "url2", "url3", .....]
url_chunks = [urllist[x:x+batch_size] for x in xrange(0, len(urllist), batch_size)]
def scrape_url(url):
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
h1 = soup.find("h1", class_= "class-headline")
return (h1.get_text())
def scrape_batch(url_chunk):
chunk_resp = []
for url in url_chunk:
chunk_resp.append(scrape_url(url))
return chunk_resp
for url_chunk in url_chunks:
print scrape_batch(url_chunk)

Categories

Resources