Trying to web scrape all the tables on a web page - python

import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
towns = pd.DataFrame()
url = f"https://www.city-data.com/city/Adak-Alaska.html"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
table_data = doc.findAll("td")
#for i in table_data:
#    towns.append(table_data[i])
print(table_data)
I'm trying to get the data from the tables, like numbers of adherents to certain religions, ethnic groups, etc. When I look at the page source, all of that data is between the td tags, but I don't see it when I print out table_data. What am I doing wrong?

import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
towns = pd.DataFrame()
url = f"https://www.city-data.com/city/Adak-Alaska.html"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
dfs = pd.read_html(page)
for x in dfs:
    print(x)  # do what you will with the data
For instance, the religions would be table 17 (dfs[17]):
Religion Adherents Congregations
0 Orthodox 754 6
1 Evangelical Protestant 232 3
2 Catholic 185 1
3 Other 112 1
4 Mainline Protestant 82 1
5 None 4196 -
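A hedged follow-up (not from the original answer): since the table index can shift, the religion table can also be located by its header rather than by the hard-coded position 17:
religion_df = next((t for t in dfs if 'Religion' in map(str, t.columns)), None)
if religion_df is not None:
    # The table whose columns include 'Religion', as shown in the printed output above
    print(religion_df)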
EDIT: Given the OP's insurmountable issues with his python install, a workaround would be:
url = "https://www.city-data.com/city/Adak-Alaska.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")
for x in soup.select('table'):
    for z in x.select('tr'):
        print([y.text.strip() for y in z.find_all(['td', 'th'])])
    print('________________')
The results can then be further transformed into DataFrames.
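For instance, a minimal sketch of that transformation, assuming the first row of each table is its header (tables with irregular rows may need extra handling):
import pandas as pd

frames = []
for table in soup.select('table'):
    rows = [[cell.text.strip() for cell in tr.find_all(['td', 'th'])]
            for tr in table.select('tr')]
    if len(rows) > 1:
        # Assumption: the first row holds the column names; rows whose length
        # differs from the header (e.g. due to colspan) are skipped.
        body = [r for r in rows[1:] if len(r) == len(rows[0])]
        frames.append(pd.DataFrame(body, columns=rows[0]))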

Related

I get error "unmatched '}'" when I scrape website(korter.az)

I want to crawl all the advertisements, but the output is "unmatched '}'". Is there an easy way to do this? I tried BeautifulSoup before, but I think it's not the right way to do it, or I'm using it the wrong way.
How can I scrape all 199 'yeni tikili binalar' (new buildings) from the website?
from ast import literal_eval
from bs4 import BeautifulSoup as bs
import requests
import re
import json
import pandas as pd
url = "https://korter.az/yasayis-kompleksleri-baku"
html_doc = requests.get(url).text
data = re.search(r'2804\.jpg"\}\}\}\],(".*")', html_doc).group(1)
data = json.loads(literal_eval(data))
df = pd.DataFrame(data)
df.to_excel('korter.xlsx', index=False)
The site has an API which can be accessed directly with requests.
The URL of the API is: "https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ"
Full Code
import requests
import math
import pandas as pd
def roundup(x):
    return int(math.ceil(x / 20.0)) * 20

# Getting the number of results
url1 = "https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ"
r = requests.get(url1)
no_of_outcomes = r.json()["totalBuildingsCount"]
# The API returns 20 results per page, so round the total (199) up to a
# multiple of 20 before dividing by 20 to get the number of pages.
no_of_outcomes = roundup(no_of_outcomes)
# Getting the sub-URL of each building by looping over the pages.
result_url = []
for k in range(1, int(no_of_outcomes/20)+1):
    url = f"https://korter.az/api/building/listing?mainGeoObjectId=1&page={k}&lang=az-AZ&locale=az-AZ"
    r = requests.get(url)
    subdata = r.json()["buildings"]
    for i in subdata:
        suburl = "https://korter.az" + i["url"]
        result_url.append(suburl)
print(len(result_url))
df = pd.DataFrame(result_url)
print(df)
Output
199
0
0 https://korter.az/toca-residence-baki
1 https://korter.az/malibu-residence-baki
2 https://korter.az/zirve-park-baki
3 https://korter.az/melissa-park-baki
4 https://korter.az/white-hotel-baki
.. ...
194 https://korter.az/yasham-boulevard-baki
195 https://korter.az/koroglu-baki
196 https://korter.az/luxor-palace-baki
197 https://korter.az/shirvanshahlar-residence-baki
198 https://korter.az/baki-baglari-baki
[199 rows x 1 columns]
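If the goal is the same Excel export as in the original attempt, the URL list can be written out directly (a minimal sketch; the column name and 'korter.xlsx' filename are simply carried over from the question, and to_excel needs openpyxl installed):
df = pd.DataFrame(result_url, columns=["url"])
df.to_excel("korter.xlsx", index=False)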
Hope this helps. Happy Coding :)

Comparison between 2 columns in Pandas and printing only those rows that satisfy the condition

This is my code and I want to compare the goals and xG column and print only those rows which satisfy the condition goals > xG.
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
pd.set_option("max_colwidth", 99999)
pd.set_option("max_rows", 99999)
url = "https://understat.com/league/EPL"
page_connect = urlopen(url)
page_html = BeautifulSoup(page_connect, "html.parser")
raw_string = page_html.findAll(name="script")[3].text
start_ind = raw_string.index("\\")
stop_ind = raw_string.index("')")
json_data = raw_string[start_ind:stop_ind]
json_data = json_data.encode("utf8").decode("unicode_escape")
final_json_df = pd.json_normalize(json.loads(json_data))
a = final_json_df[final_json_df.shots == 0]
final_json_df = final_json_df.astype({"goals" : 'float'})
final_json_df = final_json_df.astype({"xG" : 'float'})
I tried this:
final_json_df[final_json_df.goals > xG]
but it doesn't seem to work. It would be helpful if someone could give the solution and explain why final_json_df[final_json_df.goals > xG] doesn't work.
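A hedged note (not part of the original thread): inside the brackets, xG is a bare Python name rather than a column reference, so it raises a NameError; comparing the two columns on the DataFrame itself should work:
# goals and xG were both cast to float above, so compare the columns directly
# instead of referencing an undefined name xG.
mask = final_json_df["goals"] > final_json_df["xG"]
print(final_json_df[mask])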

How can I get a random object from a list in Python

I have built a list which contains hrefs from a website, and I want to randomly select one of these links. How can I do that?
from bs4 import BeautifulSoup
import urllib
import requests
import re
import random
url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone,'lxml')
def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles = a['href']
        x = random.choice(articles)
        print(x)
That code runs, but it only selects a random character from an href string instead of a random link from the list.
You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib
import requests
import re
import random
url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone,'lxml')
def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    x = random.choice(articles)
    print(x)

getItems()
The changes are:
We add each article to the articles array.
The random choice is now done after the loop, rather than inside the loop.
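One defensive addition worth considering (not in the original answer): random.choice raises an IndexError on an empty sequence, so it can help to guard against the case where no hrefs matched the pattern:
# Guard against an empty articles list before picking a random link.
if articles:
    print(random.choice(articles))
else:
    print("No matching articles found")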

Get data from an XML with criteria

I have the following code:
import pandas as pd
import urllib.parse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
url = 'http://windte2001.acepta.com/v01/E67EBB4910CFDCB067EB7D85FBA6E5511D0E64A9'.replace('/v01/', '/depot/')
x = urlopen(url)
new = x.read()
soup = BeautifulSoup(new, "lxml-xml")
result = soup.find_all(['NmbItem','QtyItem'])
which brings the following result in XML:
[<NmbItem>SERV. MANEJO DE LIQUIDOS</NmbItem>, <QtyItem>22.00</QtyItem>, <NmbItem>SERV. MANEJO DE RESPEL</NmbItem>, <QtyItem>1.00</QtyItem>]
All I need is: if NmbItem contains 'LIQUIDOS', bring me the 'QtyItem', which in this case is 22.
How can I do this with Python on this XML?
You can use a regular expression.
import re
from bs4 import BeautifulSoup
soup = BeautifulSoup(new, 'xml')
result = soup.find('NmbItem', text=re.compile("LIQUIDOS")).find_next('QtyItem').text
print(result)
You can do like this:
result = soup.find_all(['NmbItem'])
for item in result:
    if 'LIQUIDOS' in item.text:
        print(list(item.next_siblings)[3].text)
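A hedged variation combining both answers: pair every NmbItem with the QtyItem that follows it, then look up the one you need (this assumes each NmbItem is always followed by its own QtyItem in the document):
quantities = {}
for item in soup.find_all('NmbItem'):
    qty = item.find_next('QtyItem')
    if qty is not None:
        quantities[item.text] = qty.text
# e.g. quantities['SERV. MANEJO DE LIQUIDOS'] -> '22.00'
print(quantities)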

Unable to retrieve data from Macro Trends using selenium and read_html to create a data frame?

I want to import data from Macro Trends into a pandas data frame. From looking at the page source of the website, it appears the data is in a jqxgrid.
I tried using pandas/BeautifulSoup with the read_html function and no table was found. I am currently trying to use Selenium to extract the data. I was hoping that if I moved the horizontal scroll bar, the jqxgrid table would load and could be extracted. However, that did not work.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
driver = webdriver.Chrome()
driver.maximize_window()
driver.execute_script("window.location = 'http://www.macrotrends.net/stocks/charts/AMZN/amazon/income-statement?freq=Q';")
driver.implicitly_wait(2)
grid = driver.find_element_by_id('jqxgrid')
time.sleep(1)
driver.execute_script("window.scrollBy(0, 600);")
scrollbar = driver.find_element_by_id('jqxScrollThumbhorizontalScrollBarjqxgrid')
time.sleep(1)
actions = ActionChains(driver)
time.sleep(1)
for i in range(1,6):
    actions.drag_and_drop_by_offset(scrollbar,i*70,0).perform()
    time.sleep(1)
pd.read_html(grid.get_attribute('outerHTML'))
Error I get is:
ValueError: No tables found
I would expect table data from "http://www.macrotrends.net/stocks/charts/AMZN/amazon/income-statement?freq=Q" to be imported into a data frame
Here's an alternative that's quicker than Selenium and keeps the headers as shown on the page.
import requests
from bs4 import BeautifulSoup as bs
import re
import json
import pandas as pd
r = requests.get('https://www.macrotrends.net/stocks/charts/AMZN/amazon/income-statement?freq=Q')
p = re.compile(r' var originalData = (.*?);\r\n\r\n\r',re.DOTALL)
data = json.loads(p.findall(r.text)[0])
headers = list(data[0].keys())
headers.remove('popup_icon')
result = []
for row in data:
    soup = bs(row['field_name'], 'html.parser')  # parse the embedded HTML snippet
    field_name = soup.select_one('a, span').text
    fields = list(row.values())[2:]
    fields.insert(0, field_name)
    result.append(fields)
df = pd.DataFrame(result, columns=headers)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.head())
The problem is that the data is not in a table but in 'div' elements. I'm not an expert on pandas, but you can do it with BeautifulSoup.
Insert this line after your other imports:
from bs4 import BeautifulSoup
then replace your last line with:
soup = BeautifulSoup(grid.get_attribute('outerHTML'), "html.parser")
divList = soup.findAll('div', {'role': 'row'})
data = [[x.text for x in div.findChildren('div', recursive=False)] for div in divList]
df = pd.DataFrame(data)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)
This finds all the 'div' elements with role='row', then reads the text of each child 'div' under those rows, descending only one level because some rows contain multiple nested 'div' elements.
Output:
0 1 2 3 4 5 \
0 Revenue $4,135 $5,672 $3,262 $2,886
1 Cost Of Goods Sold $3,179 $4,501 $2,500 $2,185
2 Gross Profit $956 $1,171 $762 $701
3 Research And Development Expenses $234 $222 $209 $201
4 SG&A Expenses $518 $675 $427 $381
5 Other Operating Income Or Expenses $-6 $-3 $-3 $-3
...
6 7 8 9 10 11 12 13 14
0 $3,015 $3,986 $2,307 $2,139 $2,279 $2,977 $1,858 $1,753 $1,902
1 $2,296 $3,135 $1,758 $1,630 $1,732 $2,309 $1,395 $1,303 $1,444
2 $719 $851 $549 $509 $547 $668 $463 $450 $458
3 $186 $177 $172 $167 $146 $132 $121 $106 $92
4 $388 $476 $335 $292 $292 $367 $247 $238 $257
5 - $-2 $-2 $-3 $-3 $-4 $-40 $-2 $-1
...
However, as you scroll across the page, the items on the left are removed from the page source, so not all the data is scraped.
Updated in response to comment.
To set the column headers use:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.maximize_window()
driver.execute_script(
"window.location = 'http://www.macrotrends.net/stocks/charts/AMZN/amazon/income-statement?freq=Q';")
driver.implicitly_wait(2)
grid = driver.find_element_by_id('wrapperjqxgrid')
time.sleep(1)
driver.execute_script("window.scrollBy(0, 600);")
scrollbar = driver.find_element_by_id('jqxScrollThumbhorizontalScrollBarjqxgrid')
time.sleep(1)
actions = ActionChains(driver)
time.sleep(1)
for i in range(1, 6):
    actions.drag_and_drop_by_offset(scrollbar, i * 70, 0).perform()
    time.sleep(1)
soup = BeautifulSoup(grid.get_attribute('outerHTML'), "html.parser")
headersList = soup.findAll('div', {'role': 'columnheader'})
col_names=[h.text for h in headersList]
divList = soup.findAll('div', {'role': 'row'})
data = [[x.text for x in div.findChildren('div', recursive=False)] for div in divList]
df = pd.DataFrame(data, columns=col_names)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)
Outputs:
Quarterly Data | Millions of US $ except per share data 2008-03-31 \
0 Revenue $4,135
1 Cost Of Goods Sold $3,179
2 Gross Profit $956
...
2007-12-31 2007-09-30 2007-06-30 2007-03-31 2006-12-31 2006-09-30 \
0 $5,672 $3,262 $2,886 $3,015 $3,986 $2,307
1 $4,501 $2,500 $2,185 $2,296 $3,135 $1,758
...
2006-06-30 2006-03-31 2005-12-31 2005-09-30 2005-06-30 2005-03-31
0 $2,139 $2,279 $2,977 $1,858 $1,753 $1,902
1 $1,630 $1,732 $2,309 $1,395 $1,303 $1,444
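The cell values come back as strings like '$4,135' (with '-' as a placeholder), so a further cleanup step, not part of the original answer, could strip the formatting and convert the date columns to numbers:
# Strip '$' and ',' from every value column, then coerce to numeric;
# the '-' placeholders become NaN.
value_cols = df.columns[1:]
df[value_cols] = (df[value_cols]
                  .replace(r'[\$,]', '', regex=True)
                  .apply(pd.to_numeric, errors='coerce'))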
