I'm trying to soup-ify GET requests:
from bs4 import BeautifulSoup
import requests
import pandas as pd
html_page = requests.get('"https://www.dataquest.io"')
soup = BeautifulSoup(html_page, "lxml")
soup.find_all('<\a>')
However, this just returns an empty list
This will pull the table rows and assign each row to a dictionary, which is appended to a list. You may want to tweak the selectors slightly.
from bs4 import BeautifulSoup
import requests
from pprint import pprint

output_data = []  # This is a list of dicts containing all of the table data

for i in range(1, 453):  # For loop used to paginate
    data_page = requests.get(f'https://www.dataquest.io?')
    print(data_page)
    soup = BeautifulSoup(data_page.text, "lxml")

    # Find all of the table rows
    elements = soup.select('div.head_table_t')
    try:
        secondary_elements = soup.select('div.list_table_subs')
        elements = elements + secondary_elements
    except:
        pass
    print(len(elements))

    # Iterate through the rows, select the individual columns and assign them to the dictionary with the correct header
    for element in elements:
        data = {}
        data['Name'] = element.select_one('div.col_1 a').text.strip()
        data['Page URL'] = element.select_one('div.col_1 a')['href']
        output_data.append(data)  # Append the dictionary (contact info) to the list
        pprint(data)  # Pretty print the dictionary (to see what you're receiving; this can be removed)
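As a side note on the original snippet: the URL string contains stray quote characters, the Response object is passed to BeautifulSoup instead of its text, and find_all('<\a>') searches for a tag literally named <\a> - tag names are given without angle brackets, which is why an empty list comes back. A minimal corrected sketch:

from bs4 import BeautifulSoup
import requests

html_page = requests.get('https://www.dataquest.io')  # no quote characters inside the URL string
soup = BeautifulSoup(html_page.text, "lxml")          # parse the response body, not the Response object
links = soup.find_all('a')                            # pass the bare tag name
print(len(links))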
Here I am trying to get the value of every column in the table shown in the picture (for three different pages) and store them in a pandas dataframe. I have collected the data and now have a list of lists, but when I try to add it to a dictionary I get an empty dictionary. Can anyone tell me what I'm doing wrong, or suggest an alternative way to create 3 dataframes, one for each table?
Here is my code:
import numpy as np
import pandas as pd
from datetime import datetime
import pytz
import requests
import json
from bs4 import BeautifulSoup
url_list = ['https://www.coingecko.com/en/coins/ethereum/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
            'https://www.coingecko.com/en/coins/cardano/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
            'https://www.coingecko.com/en/coins/chainlink/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel']
results = []
for url in url_list:
    response = requests.get(url)
    src = response.content
    soup = BeautifulSoup(response.text, 'html.parser')
    results.append(soup.find_all("td", class_="text-center"))

collected_data = dict()

for result in results:
    for r in result:
        datas = r.find_all("td", title=True)
        for data in datas:
            collected_data.setdefault(data.text)
collected_data
What happens?
In your first for loop you only append the result sets of soup.find_all("td", class_="text-center") to results.
So you won't find what you are looking for with datas = r.find_all("td", title=True) - each r is already a <td>, so there are no nested <td> elements inside it.
Note also that the column headers are not placed in <td> but in <th>.
How to fix?
You could select more specifically - iterate over all <tr> in <tbody>:
for row in soup.select('tbody tr'):
While iterating, select the <th> and <td> cells and zip() them into a dict() with the list of column headers:
    data.append(
        dict(zip([x.text for x in soup.select('thead th')], [x.text.strip() for x in row.select('th,td')]))
    )
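For intuition, pairing one header list with one row's cells (values taken from the output below) gives exactly one dictionary per table row:

headers = ['Date', 'Market Cap', 'Volume', 'Open', 'Close']
cells = ['2021-09-05', '$456,929,768,632', '$24,002,848,309', '$3,894.94', 'N/A']

dict(zip(headers, cells))
# -> {'Date': '2021-09-05', 'Market Cap': '$456,929,768,632', 'Volume': '$24,002,848,309', 'Open': '$3,894.94', 'Close': 'N/A'}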
Example
import pandas as pd
import requests
from bs4 import BeautifulSoup
url_list = ['https://www.coingecko.com/en/coins/ethereum/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
            'https://www.coingecko.com/en/coins/cardano/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
            'https://www.coingecko.com/en/coins/chainlink/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel']
data = []
for url in url_list:
    response = requests.get(url)
    src = response.content
    soup = BeautifulSoup(response.text, 'html.parser')

    for row in soup.select('tbody tr'):
        data.append(
            dict(zip([x.text for x in soup.select('thead th')], [x.text.strip() for x in row.select('th,td')]))
        )

pd.DataFrame(data)
Output
Date        Market Cap        Volume           Open       Close
2021-09-05  $456,929,768,632  $24,002,848,309  $3,894.94  N/A
2021-09-04  $462,019,852,288  $30,463,347,266  $3,936.16  $3,894.94
2021-09-03  $444,936,758,975  $28,115,776,510  $3,793.30  $3,936.16
EDIT
To get a data frame per URL you can change the code to the following - it will append the frames to a list, so that you can iterate over it and work with each frame.
Note: This is based on your comment, so use it if it fits. I would suggest also storing the coin as a column, so that you could filter, group by, ... across all coins - but that should be asked in a new question, if it matters.
dfList = []

for url in url_list:
    response = requests.get(url)
    src = response.content
    soup = BeautifulSoup(response.text, 'html.parser')
    data = []
    coin = url.split("/")[5].upper()

    for row in soup.select('tbody tr'):
        data.append(
            dict(zip([f'{x.text}_{coin}' for x in soup.select('thead th')], [x.text.strip() for x in row.select('th,td')]))
        )

    # if you like to save directly as csv... change the next line to -> pd.DataFrame(data).to_csv(f'{coin}.csv')
    dfList.append(pd.DataFrame(data))
Output
Select a data frame by list index, for example dfList[0]:
Date_ETHEREUM  Market Cap_ETHEREUM  Volume_ETHEREUM  Open_ETHEREUM  Close_ETHEREUM
2021-09-05     $456,929,768,632     $24,002,848,309  $3,894.94      N/A
2021-09-04     $462,019,852,288     $30,463,347,266  $3,936.16      $3,894.94
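If you then want one CSV per coin, as hinted in the comment above, a minimal sketch (assuming dfList and url_list from the code above; the derived file names such as ETHEREUM.csv are just an assumption):

for url, df in zip(url_list, dfList):
    coin = url.split("/")[5].upper()  # same coin name extraction as above
    df.to_csv(f'{coin}.csv', index=False)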
I'm new to web scraping. I'm trying to scrape data from the news site.
I have this code:
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
detik_url = "https://news.detik.com/indeks/2"
detik_url
html = requests.get(detik_url)
bsobj = soup(html.content, 'lxml')
bsobj
for link in bsobj.findAll("h3"):
    print("Headline : {}".format(link.text.strip()))

links = []
for news in bsobj.findAll('article', {'class': 'list-content__item'}):
    links.append(news.a['href'])

for link in links:
    page = requests.get(link)
    bsobj = soup(page.content)
    div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
    for p in div:
        print(p.find('p').text.strip())
How do I utilize a Pandas Dataframe to store the obtained content into a CSV file?
You can store your content in a pandas dataframe, and then write the structure to a csv file.
Suppose you want to save all the text from p.find('p').text.strip(), along with the headline, in a csv file; you can store your headline in any variable (say head).
So, from your code:
for link in links:
    page = requests.get(link)
    bsobj = soup(page.content)
    div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
    for p in div:  # <----- Here we make the changes
        print(p.find('p').text.strip())
In the line shown above, we do the following:
import pandas as pd

# Create an empty list to store all the data
generated_text = []

for link in links:
    page = requests.get(link)
    bsobj = soup(page.content)
    div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
    for p in div:
        # keep the print statement if you want to see the output
        generated_text.append(p.find('p').text.strip())  # <---- save the data in the list

# then write this into a csv file using pandas; first create a
# dataframe from the list
df = pd.DataFrame(generated_text, columns=[head])

# save this into a csv file
df.to_csv('csv_name.csv', index=False)
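If you also want the headline filled in per article instead of a single head value, here is a hedged sketch; it assumes each article page carries its title in an <h1> tag (verify that against the site's markup), and the column names are just placeholders:

rows = []  # one dict per paragraph so the keys become the dataframe columns

for link in links:
    page = requests.get(link)
    bsobj = soup(page.content, 'lxml')
    # assumption: the article title lives in an <h1>; adjust the selector to the real markup
    title_tag = bsobj.find('h1')
    head = title_tag.text.strip() if title_tag else ''
    div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
    for p in div:
        rows.append({'headline': head, 'text': p.find('p').text.strip()})

pd.DataFrame(rows).to_csv('csv_name.csv', index=False)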
Also, instead of the for loop, you can directly use list comprehensions and save to your CSV.
# instead of the above snippet, replace the whole `for p in div` loop by
# So from your code above:
.....
bsobj = soup(page.content)
div = bsobj.findAll('div',{'class':'detail__body itp_bodycontent_wrapper'})
# Remove the whole `for p in div:` and instead use this:
df = pd.DataFrame([p.find('p').text.strip() for p in div], columns = [head])
....
df.to_csv('csv_name.csv', index = False)
Also, you can convert the list built by the comprehension to a numpy array and write that to a csv file:
import numpy as np
import pandas as pd

# On a side note:
# convert your normal list to a numpy array (or build the numpy array directly),
# and from there you can write to a csv
nparray = np.array(generated_text)
pd.DataFrame(nparray).to_csv('csv_name.csv')
This question already has answers here:
What is the difference between Python's list methods append and extend?
So I'm working on a web scraper function to get movie data from IMDB. While I'm able to get the data I need into a dictionary, because of how I've written the function, each dictionary value ends up as a list of lists. I would like each value to be a single flat list.
So right now I'm getting
dict_key: [[A,B,C,D],[E,F,G,H],...] and I want dict_key: [A,B,C,D,E,F,G,H].
Eventually, I want to then take this dictionary and convert it to a pandas dataframe with col names corresponding to the dictionary keys.
Here is my function:
It takes a list of URLs, HTML tags, and variable names and gets the movie category(s), year, and length of movie.
def web_scraper(urls, class_list, col_names):
    import requests  # Import necessary modules
    from bs4 import BeautifulSoup
    import pandas as pd

    class_dict = {}
    for col in col_names:
        class_dict[col] = []

    for url in urls:
        page = requests.get(url)  # Request the page
        soup = BeautifulSoup(page.content, 'html.parser')  # Create a soup object

        for i in range(len(class_list)):  # Loop through class_list and col_names
            names = soup.select(class_list[i])  # Get the matching elements
            names = [name.getText(strip=True) for name in names]  # Extract the text from each element
            class_dict[col_names[i]].append(names)

    for class_ in class_dict:  # Here is my attempt to flatten the list
        class_ = [item for sublist in class_ for item in sublist]

    return class_dict
Your flattening attempt doesn't work because iterating over class_dict yields its keys, and rebinding the loop variable class_ never changes the values stored in the dictionary. The simpler fix is to avoid the nesting in the first place - use list.extend instead of list.append:
def web_scraper(urls, class_list, col_names):
    import requests  # Import necessary modules
    from bs4 import BeautifulSoup
    import pandas as pd

    class_dict = {}
    for col in col_names:
        class_dict[col] = []

    for url in urls:
        page = requests.get(url)  # Request the page
        soup = BeautifulSoup(page.content, 'html.parser')  # Create a soup object

        for i in range(len(class_list)):  # Loop through class_list and col_names
            names = soup.select(class_list[i])  # Get the matching elements
            names = [name.getText(strip=True) for name in names]  # Extract the text from each element
            class_dict[col_names[i]].extend(names)  # <--- use list.extend

    return class_dict
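To see the difference in isolation, and to cover the eventual dataframe step (the values here are just placeholders):

# append nests its argument as a single element; extend splices the items in
lst = ['A', 'B']
lst.append(['C', 'D'])  # -> ['A', 'B', ['C', 'D']]

lst = ['A', 'B']
lst.extend(['C', 'D'])  # -> ['A', 'B', 'C', 'D']

# once every value in class_dict is a flat list (and the lists have equal length),
# the dictionary keys become the column names directly:
# df = pd.DataFrame(web_scraper(urls, class_list, col_names))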
I'm trying to get data from this site and then use some of it. Sorry for not copy-pasting it, but it's a long xml. So far I have tried to get the data this way:
from urllib.request import urlopen
url = "http://degra.wi.pb.edu.pl/rozklady/webservices.php?"
s = urlopen(url)
content = s.read()
print(content) looks good, so now I would like to extract data from it:
<tabela_rozklad data-aktualizacji="1480583567">
    <DZIEN>2</DZIEN>
    <GODZ>3</GODZ>
    <ILOSC>2</ILOSC>
    <TYG>0</TYG>
    <ID_NAUCZ>66</ID_NAUCZ>
    <ID_SALA>79</ID_SALA>
    <ID_PRZ>104</ID_PRZ>
    <RODZ>W</RODZ>
    <GRUPA>1</GRUPA>
    <ID_ST>13</ID_ST>
    <SEM>1</SEM>
    <ID_SPEC>0</ID_SPEC>
</tabela_rozklad>
How can I handle this data to easy use it?
You can use BeautifulSoup and capture the tags you want. The code below should get you started!
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "http://degra.wi.pb.edu.pl/rozklady/webservices.php?"
# secure url content
response = requests.get(url).content
soup = BeautifulSoup(response, 'html.parser')  # html.parser lowercases tag names, so the lowercase tags below still match
# find each tabela_rozklad
tables = soup.find_all('tabela_rozklad')
# for each tabela_rozklad looks like there is 12 nested corresponding tags
tags = ['dzien', 'godz', 'ilosc', 'tyg', 'id_naucz', 'id_sala',
'id_prz', 'rodz', 'grupa', 'id_st', 'sem', 'id_spec']
# collect one row per tabela_rozklad by extracting the text of each nested tag
rows = []
for table in tables:
    rows.append([table.find(tag).text for tag in tags])

# build the dataframe with the tags as column names
df = pd.DataFrame(rows, columns=tags)
# display first 5 rows of table
df.head()
# and the shape of the data
df.shape # 665 rows, 12 columns
# and now you can get to the information using traditional pandas functionality
# for instance, count observations by rodz
df.groupby('rodz').count()
# or subset only observations where rodz = J
J = df[df.rodz == 'J']
I don't know if there is such a thing, but I'm trying to do an ordered dict comprehension. However, it doesn't seem to work.
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')
t_data = OrderedDict()
rows = tables[1].find_all('tr')
t_data = {row.th.text: row.td.text for row in rows if row.td }
It's left as a normal dict comprehension for now (I've also left out the usual requests to soup boilerplate).
Any ideas?
You can't directly do a comprehension with an OrderedDict. You can, however, pass a generator expression to the OrderedDict constructor.
Try this on for size:
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')
rows = tables[1].find_all('tr')
t_data = OrderedDict((row.th.text, row.td.text) for row in rows if row.td)
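Worth noting, as a side point: on Python 3.7+ regular dicts preserve insertion order, so unless you need OrderedDict-specific behaviour the original comprehension already keeps the rows in document order:

# plain dicts are insertion-ordered on Python 3.7 and later
t_data = {row.th.text: row.td.text for row in rows if row.td}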