I am trying to fetch the calendar of events for the current month from my school's website https://www.umkc.edu/calendar/ and store it in a file in the same format (date, time, event, and location).
I don't have much experience with HTML, so I don't really know what I'm dealing with; that's why I looked at the iframe, but I don't even know if that's the right approach.
Can you help me with getting the table that has the events?
This is what I have so far:
import requests
from bs4 import BeautifulSoup
html = requests.get("https://www.umkc.edu/calendar/")
soup = BeautifulSoup(html.content, "html.parser")
print(soup.find_all("iframe"))
The data is added dynamically. You could issue the same request the page performs for the iframe content, use a regex (or split) to pull out the required JavaScript, parse it with hjson (needed because the keys are unquoted), then extract the HTML content and parse it with bs4. The alternative, shown at the bottom, is to investigate the RSS feed.
from bs4 import BeautifulSoup as bs
import requests, re, hjson
import pandas as pd
r = requests.get('https://www.trumba.com/s.aspx?calendar=UMKC&widget=main&srpc.cbid=trumba.spud.4&srpc.get=true')
p = re.compile(r'requestComplete\((.*)\);', re.DOTALL)
data = hjson.loads(re.sub('\r+|\n+|\t+','', p.findall(r.text)[0].strip()))
soup = bs(data['body'],'lxml')
p2 = re.compile(r"'(.*?)'", re.DOTALL)
results = []
for tr in soup.select('.twSimpleTableTable tr')[2:]:  # skip the header rows
    date = tr.select_one('.twStartDate').text
    time = tr.select_one('td:has(.twStartDate) + td, .twStartTime').text
    event = p2.findall(tr.select_one('.txVMid')['title'])[0].strip()
    location = tr.select_one('.twLocation')
    if location is None:
        location = ''
    else:
        location = ' '.join(location.stripped_strings)
    row = [date, time, event, location]
    results.append(row)
df = pd.DataFrame(results)
print(df)
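Since the goal is to store the events in a file, you could also label the columns and write the frame out (a small follow-up sketch; the filename is just an example):
df.columns = ['date', 'time', 'event', 'location']
df.to_csv('umkc_events.csv', index=False)  # example output path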
Or (this needs further investigation and work):
import requests, re
import pandas as pd
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.trumba.com/calendars/UMKC.rss')
soup = bs(r.content, 'lxml')
df = pd.DataFrame(zip([i.text for i in soup.select('title')],
                      [i.text for i in soup.select('link')],
                      [re.sub('<br />|<br/>', '', i.text) for i in soup.select('description')]))
print(df)
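If you pursue the RSS route, naming the columns (in the same order as the zip above) makes the frame easier to work with:
df.columns = ['title', 'link', 'description']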
I am looking to scrape some publicly available data from one of the technology/analyst research firms.
I have gotten as far as printing out the title and position, but the text.strip() call has not really worked; I am probably missing something obvious.
import requests
from bs4 import BeautifulSoup
from requests.api import head
# get the data
data = requests.get("https://www.forrester.com/bio/michele-goetz?id=BIO5224")
# load data into bs4
soup = BeautifulSoup(data.text, "html.parser")
analyst_data = soup.find("div", { "class": "col-md-9" })
#print(analyst_data)
header_title = analyst_data.find("h1")
header_paragraph = analyst_data.find("p")
print(header_title,header_paragraph)
for data in header_title.find_all(), header_paragraph.find_all():
    name = data.find_all("h1")[0].text.strip()
    position = data.find_all("p")[1].text.strip()
    print(name, position)
You have already found a tag when doing:
header_title = analyst_data.find("h1")
header_paragraph = analyst_data.find("p")
therefore, there is no point in creating this for loop:
for data in header_title.find_all(), header_paragraph.find_all():
    name = data.find_all("h1")[0].text.strip()
    position = data.find_all("p")[1].text.strip()
    print(name, position)
Instead, call .text on header_title and header_paragraph. With your example:
import requests
from bs4 import BeautifulSoup
# get the data
data = requests.get("https://www.forrester.com/bio/michele-goetz?id=BIO5224")
# load data into bs4
soup = BeautifulSoup(data.text, "html.parser")
analyst_data = soup.find("div", { "class": "col-md-9" })
#print(analyst_data)
header_title = analyst_data.find("h1")
header_paragraph = analyst_data.find("p")
print(header_title.text.strip(), header_paragraph.text.strip())
Output:
Michele Goetz VP, Principal Analyst
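As a side note, the same extraction can be written with CSS selectors via select_one (a minimal sketch, assuming the page structure stays the same):
name = soup.select_one('div.col-md-9 h1').text.strip()
position = soup.select_one('div.col-md-9 p').text.strip()
print(name, position)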
I'm writing a Python script to extract records of all people on a site using Selenium, BeautifulSoup, and pandas. However, I don't know how to go about it, because the site is designed so that someone has to search first before getting any results. For testing purposes, I'm passing a search value and submitting it via Selenium. The issue is that the script gives the desired results when run line by line in an IPython shell, but the same code throws an error when run as a Python file via the python command.
Code:
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import re
# assumes br is a webdriver instance (e.g. webdriver.Chrome()) and url is the search page
br.get(url)
sleep(2)
# fill in the search box and submit the search
sName = br.find_element_by_xpath("/html/body/div[1]/div[2]/section/div[2]/div/div/div/div/div/div/div[2]/form/div[1]/div/div/input")
sName.send_keys("martin")
br.find_element_by_xpath("//*[@id='provider']/div[1]/div/div/div/button").click()
sleep(3)
# parse the page source only after the results have loaded
content = br.page_source
soup = BeautifulSoup(content, 'lxml')
table = soup.find('table')
tbody = table.find('tbody')
body = tbody.find_all('tr')
#
# get column heads
head = body[0]
body_rows = body[1:]
headings = []
for item in head.find_all('th'):
    item = item.text.rstrip("\n")
    headings.append(item)
print(headings)
#declare an empty list for holding all records
all_rows = []
# loop through all table rows to get all table datas
for row_num in range(len(body_rows)):
    row = []
    for row_item in body_rows[row_num].find_all('td'):
        stripA = re.sub("(\xa0)|(\n)|,", "", row_item.text)
        row.append(stripA)
    all_rows.append(row)
# match each record to its field name
# cols = ['name', 'license', 'xxx', 'xxxx']
df = pd.DataFrame(data=all_rows, columns=headings)
You don't need the overhead of a browser, or to worry about waits. You can simply mimic the POST request the page makes:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
data = {'search_register': '1', 'search_text': 'Martin'}
r = requests.post('https://osp.nckenya.com/ajax/public', data=data)
soup = bs(r.content, 'lxml')
results = pd.read_html(str(soup.select_one('#datatable2')))
print(results)
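Note that pd.read_html returns a list of DataFrames, so grab the first one before working with it:
df = results[0]
print(df.head())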
I have scraped the data from this table, using Python and BeautifulSoup, from all the pages of this website into a dictionary, as seen in the code below. However, I am also trying to scrape each company's own separate page into that dictionary as well.
import requests
from bs4 import BeautifulSoup
from pprint import pprint
company_data = []
for i in range(1, 3):
    page = requests.get(f'https://web.archive.org/web/20121007172955/http://www.nga.gov/collection/anZ1.htm{i}?')
    soup = BeautifulSoup(page.text, "lxml")
    row_info = soup.select('div.accordion_heading.panel-group.s_list_table')
    for row in row_info:
        comapny_info = {}
        comapny_info['Name'] = row.select_one('div.col_1 a').text.strip()
        company_data.append(comapny_info)  # collect each company's record
pprint(company_data)
I have only done it for the 2M company. I believe that helps.
import requests
from bs4 import BeautifulSoup
res = requests.get("https://web.archive.org/web/20121007172955/http://www.nga.gov/collection/anZ1.htm").text
soup = BeautifulSoup(res, 'html.parser')
comapny_info = {}
comapny_info['Profile'] = soup.select('div.text-desc-members')
if len(soup.select('div.text-desc-members')) == 0:
    comapny_info['Profile'] = soup.select('div.list-sub')[0].text.strip()
comapny_info['ACOP'] = [item['href'] for item in soup.select(".table.table-striped a.files")]
comapny_info['QuestionAnswer'] = ["Question:" + q.text.strip() + " Answer:" + a.text.strip() for q, a in zip(soup.select("div.list-reports .m_question"), soup.select("div.list-reports .m_answer"))]
print(comapny_info)
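To fold this into the paging loop from the question, you could follow each company's link and parse its page inside the loop. A sketch, assuming each row's anchor carries an href pointing at the company's own page (the selectors are the ones used above):
from urllib.parse import urljoin

for row in row_info:
    comapny_info = {}
    link = row.select_one('div.col_1 a')
    comapny_info['Name'] = link.text.strip()
    # fetch and parse the company's own page (assumes the anchor has an href)
    detail_url = urljoin(page.url, link['href'])
    detail = BeautifulSoup(requests.get(detail_url).text, 'html.parser')
    profile = detail.select('div.text-desc-members')
    comapny_info['Profile'] = profile[0].text.strip() if profile else ''
    company_data.append(comapny_info)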
I am trying to get all the graphics card details into a CSV file, but I am not able to scrape the data (I'm doing this as a project, to learn scraping). I am new to Python and HTML.
I am using the urllib request and BeautifulSoup libraries.
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
uClient = uReq(my_url)
Negg = uClient.read()
uClient.close()
Complete_Graphics_New_Egg = soup(Negg,"html.parser")
Container_Main = Complete_Graphics_New_Egg.findAll("div",{"class":"item-container"})
Container_Main5 = str(Container_Main[5])
path_file='C:\\Users\\HP\\Documents\\Python\\Container_Main5.txt'
file_1 = open(path_file,'w')
file_1.write(Container_Main5)
file_1.close()
##Container_Main_details = Container_Main5.a
#div class="item-badges"
Container_5_1 = str(Container_Main[5].findAll("ul",{"class":"item-features"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_1.txt'
file_5_1 = open(path_file,'w')
file_5_1.write(Container_5_1)
file_5_1.close()
Container_5_2 = str(Container_Main[5].findAll("p",{"class":"item-promo"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_2.txt'
file_5_2 = open(path_file,'w')
file_5_2.write(Container_5_2)
file_5_2.close()
##p class="item-promo"
##div class="item-info"
This should get you started. I'll break it down a bit for you, too, so you can modify and play while you're learning. I'm also suggesting you use pandas, as it's a popular library for data manipulation, and you'll be using it in the near future if you're not already.
I first initialize a results dataframe to store all the data you'll be parsing:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
Next, get the html from the site and pass it into BeautifulSoup:
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Then you had it find all the tags you were interested in. The only thing I added was to have it iterate over each of those tags/elements it finds:
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
and then, in each of those containers, grab the data you want from the item features and item promo. I store that data in a temporary dataframe (of 1 row) and then append it to my results dataframe. So after each iteration the temp dataframe is overwritten with the new info, but the results won't be overwritten; rows just keep getting added.
Lastly, use pandas to save the dataframe to csv.
results.to_csv('path/file.csv', index=False)
So full code:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
    item_features = container.find("ul", {"class": "item-features"})
    # if there are no item-features, move on to the next container
    if item_features is None:
        continue
    temp_df = pd.DataFrame(index=[0])
    features_list = item_features.find_all('li')
    for feature in features_list:
        split_str = feature.text.split(':')
        header = split_str[0]
        data = split_str[1].strip()
        temp_df[header] = data
    promo = container.find_all("p", {"class": "item-promo"})[0].text
    temp_df['promo'] = promo
    results = results.append(temp_df, sort=False).reset_index(drop=True)
results.to_csv('path/file.csv', index=False)
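One caveat: DataFrame.append was deprecated and has been removed in pandas 2.0, so on newer pandas you would collect the per-item frames in a list and concatenate once at the end. A sketch of the same loop in that style:
frames = []
for container in Container_Main:
    item_features = container.find("ul", {"class": "item-features"})
    if item_features is None:
        continue
    temp_df = pd.DataFrame(index=[0])
    for feature in item_features.find_all('li'):
        # partition tolerates features without a ':' separator
        header, _, value = feature.text.partition(':')
        temp_df[header] = value.strip()
    promo = container.find_all("p", {"class": "item-promo"})
    temp_df['promo'] = promo[0].text if promo else ''
    frames.append(temp_df)
results = pd.concat(frames, ignore_index=True, sort=False)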
I have to write some code to scrape data from a website and then analyse it for university.
My problem is that I wrote this code to get some data for all the products, but when I run it, it only shows a single response for each variable.
Can you help me resolve this?
from bs4 import BeautifulSoup as soup
import urllib
from urllib.request import urlopen as uReq
import requests
myurl='https://boutique.orange.fr/mobile/choisir-un-mobile'
Uclient=uReq(myurl)
page=Uclient.read()
Uclient.close()
pagesoup=soup(page,'html.parser')
containers=pagesoup.findAll('div',{'class':'box-prod pointer'})
container=containers[0]
produit=container.img['alt']
price=container.findAll('span',{'class':'price'})
price2=container.findAll('div',{'class':'prix-seul'})
avis=container.footer.div.a.img['alt']
file="orange.csv"
f=open(file,'w')
headers='produit,prix avec abonnement, prix seul, avis\n'
f.write(headers)
for container in containers:
    produit = container.img['alt']
    price = ' '.join(p.text.strip() for p in container.findAll('span', {'class': 'price'}))
    price2 = ' '.join(p.text.strip() for p in container.findAll('div', {'class': 'prix-seul'}))
    avis = container.footer.div.a.img['alt']
    # write each product as a csv row
    f.write(','.join([produit, price, price2, avis]) + '\n')
f.close()
You could use different selectors: separate the two prices per product by index, and extract the price-specific info using join and findall.
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
url = 'https://boutique.orange.fr/mobile/choisir-un-mobile'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
#print(len(soup.select('#resultat .box-prod.pointer')))
p = re.compile('[0-9,€]+')
altText= [item.get('alt').strip() for item in soup.select('#resultat .box-prod.pointer .lazy')]
titles = [item.text.strip().replace('\n', ' ') for item in soup.select('#resultat .box-prod.pointer .titre-produit')]
allPrices = [''.join(p.findall(item.text)) for item in soup.select('#resultat span.price')]
aPartirPrice = allPrices[0::2]
prixSeul = allPrices[1::2]
items = list(zip(titles, altText, aPartirPrice, prixSeul))
df = pd.DataFrame(items,columns=['title', 'altText', 'aPartirPrice', 'prixSeul'])
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8',index = False )
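The [0::2] and [1::2] slicing works because select returns the prices in page order, two per product. A tiny illustration with made-up values:
allPrices = ['229€', '529€', '129€', '349€']  # hypothetical scraped values
aPartirPrice = allPrices[0::2]  # ['229€', '129€'], the first price of each product
prixSeul = allPrices[1::2]      # ['529€', '349€'], the second price of each product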
Transpose with:
df = df.T