I want to crawl all advertisements, but the output is "unmatched '}'". Is there an easy way to do it? I tried BeautifulSoup before, but I think it's not the correct way to do it, or I'm using it the wrong way.
How can I scrape all 199 'yeni tikili binalar' (new buildings) from the website?
from ast import literal_eval
from bs4 import BeautifulSoup as bs
import requests
import re
import json
import pandas as pd

url = "https://korter.az/yasayis-kompleksleri-baku"
html_doc = requests.get(url).text
data = re.search(r'2804\.jpg"\}\}\}\],(".*")', html_doc).group(1)
data = json.loads(literal_eval(data))
df = pd.DataFrame(data)
df.to_excel('korter.xlsx', index=False)
The site has an API which can be accessed with requests.
The URL of the API is: "https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ"
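For a quick sanity check you can hit that endpoint once and inspect the JSON; the totalBuildingsCount and buildings keys are the ones the full code below relies on:

import requests

r = requests.get("https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ")
payload = r.json()
print(payload["totalBuildingsCount"])    # total number of listings, 199 at the time of writing
print(payload["buildings"][0]["url"])    # relative URL of the first listing on this page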
Full Code
import requests
import math
import pandas as pd

def roundup(x):
    return int(math.ceil(x / 20.0)) * 20

# Getting the number of results
url1 = "https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ"
r = requests.get(url1)
no_of_outcomes = r.json()["totalBuildingsCount"]

# The API serves 20 results per page, so round the total (here 199) up to a
# multiple of 20 before dividing by 20 to get the page count
no_of_outcomes = roundup(no_of_outcomes)

# Getting the sub-URL of every listing by looping over the pages
result_url = []
for k in range(1, int(no_of_outcomes / 20) + 1):
    url = f"https://korter.az/api/building/listing?mainGeoObjectId=1&page={k}&lang=az-AZ&locale=az-AZ"
    r = requests.get(url)
    subdata = r.json()["buildings"]
    for i in subdata:
        suburl = "https://korter.az" + i["url"]
        result_url.append(suburl)

print(len(result_url))
df = pd.DataFrame(result_url)
print(df)
Output
199
0
0 https://korter.az/toca-residence-baki
1 https://korter.az/malibu-residence-baki
2 https://korter.az/zirve-park-baki
3 https://korter.az/melissa-park-baki
4 https://korter.az/white-hotel-baki
.. ...
194 https://korter.az/yasham-boulevard-baki
195 https://korter.az/koroglu-baki
196 https://korter.az/luxor-palace-baki
197 https://korter.az/shirvanshahlar-residence-baki
198 https://korter.az/baki-baglari-baki
[199 rows x 1 columns]
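If you also want the Excel file the question was aiming for, the collected list can be written out directly (a small follow-up reusing result_url from above):

df = pd.DataFrame(result_url, columns=['url'])
df.to_excel('korter.xlsx', index=False)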
Hope this helps. Happy Coding :)
# Import libs
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

# Form data for passing to the request body
formdata = {'objid': '14'}

# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"

# Query
for i in range(1, 15):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    out = response.content
    soup = BeautifulSoup(out, "html.parser")
    bat = json.loads(soup.text)
    df = pd.DataFrame(bat["ops1"])
    df.to_csv(str(i) + ".csv")
Right now this query creates 14 CSV files. What I want is for the loop to drop the column-header row of each result and append the data to a DataFrame I created outside the loop, so that I end up with a single CSV file.
I am using BeautifulSoup and pandas.
This is one way of achieving your goal:
# Import libs
import pandas as pd
import requests
from tqdm import tqdm  ## if using jupyter: from tqdm.notebook import tqdm

final_df = pd.DataFrame()

# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"

# Query
for i in tqdm(range(1, 15)):
    formdata = {'objid': i}
    r = requests.post(url, data=formdata)
    df = pd.json_normalize(r.json()["ops1"])
    final_df = pd.concat([final_df, df], axis=0, ignore_index=True)

final_df.to_csv('some_data_saved.csv')
print(final_df)
Data will be saved to a csv file, and also printed in terminal:
100% 14/14 [00:14<00:00, 1.05s/it]
value text
0 8o7LEdvX2e G14001-Kumbadaje
1 jw2XOQyZ4K G14002-Bellur
2 0lMB1O4LbV G14003-Karadka
3 zodLro2Z39 G14004-Muliyar
4 dWxLYn8ZME G14005-Delampady
... ... ...
1029 Qy6Z09bBKE G01073-Ottoor
1030 ywoXG8wLxV M01001-Neyyattinkara
1031 Kk8Xvz7XO9 M01002-Nedumangad
1032 r7eXQYgX8m M01003-Attingal
1033 b3KXlO2B8g M01004-Varkala
1034 rows × 2 columns
Requests can decode JSON responses itself, so you don't need to import bs4 & json.
For tqdm, please see https://pypi.org/project/tqdm/
For pandas documentation, visit https://pandas.pydata.org/docs/
Also for Requests: https://requests.readthedocs.io/en/latest/
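A minimal illustration of the r.json() point above, using the same endpoint (one hypothetical objid value):

import requests

r = requests.post("https://www.sec.kerala.gov.in/public/getalllbcmp/byd", data={"objid": 1})
data = r.json()           # decodes the JSON body directly, no BeautifulSoup or json.loads needed
print(data["ops1"][:2])   # first two records of the payload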
I would use a function to get the data and return a DataFrame, then use it within concat:
def get_data(i):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    out = response.content
    soup = BeautifulSoup(out, "html.parser")
    bat = json.loads(soup.text)
    return pd.DataFrame(bat["ops1"])

df = pd.concat([get_data(i) for i in range(1, 15)])
df.to_csv('all_data.csv')
NB. If this gives you unsatisfactory results, please provide a short extract of 2–3 dataframes and the expected merged output.
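On the header worry in the question: because each page is parsed straight into a DataFrame, the column names live in the frame's columns rather than in its rows, so concat never duplicates them as data. A tiny illustration with made-up frames (column names borrowed from the output above):

a = pd.DataFrame({'value': ['x1'], 'text': ['t1']})
b = pd.DataFrame({'value': ['x2'], 'text': ['t2']})
print(pd.concat([a, b], ignore_index=True))  # two rows, one shared set of column names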
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re

towns = pd.DataFrame()
url = "https://www.city-data.com/city/Adak-Alaska.html"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
table_data = doc.findAll("td")
#for i in table_data:
#    towns.append(table_data[i])
print(table_data)
I'm trying to get the data from the tables, like the numbers of adherents of certain religions, ethnic groups, etc. When I look at the source page, all that stuff is between the td tags, but I'm not seeing it when I print out table_data. What am I doing wrong?
import pandas as pd
import requests

url = "https://www.city-data.com/city/Adak-Alaska.html"
page = requests.get(url).text
dfs = pd.read_html(page)  # parses every <table> on the page into a DataFrame
for x in dfs:
    print(x)  ## do what you will with the data
For instance, the religions would be table 17 (dfs[17]):
Religion Adherents Congregations
0 Orthodox 754 6
1 Evangelical Protestant 232 3
2 Catholic 185 1
3 Other 112 1
4 Mainline Protestant 82 1
5 None 4196 -
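If you only need that one table, pd.read_html can also filter by text instead of relying on a positional index (a sketch, assuming the string 'Religion' appears only in this table):

religions = pd.read_html(page, match='Religion')[0]
print(religions)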
EDIT: Given the OP's insurmountable issues with their Python install, a workaround would be:
url = "https://www.city-data.com/city/Adak-Alaska.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")
for x in soup.select('table'):
    for z in x.select('tr'):
        print([y.text.strip() for y in z.find_all(['td', 'th'])])
    print('________________')
Results can be further transformed in dataframes.
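For example, one of those tables can be rebuilt as a DataFrame by hand (a sketch; it assumes the table's first row holds the headers, which is not true of every table on the page):

table = soup.select('table')[17]  # hypothetical index, adjust to the table you want
rows = [[y.text.strip() for y in z.find_all(['td', 'th'])] for z in table.select('tr')]
df = pd.DataFrame(rows[1:], columns=rows[0])
print(df)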
This is my code. I want to compare the goals and xG columns and print only those rows which satisfy the condition goals > xG.
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
pd.set_option("max_colwidth", 99999)
pd.set_option("max_rows", 99999)
url = "https://understat.com/league/EPL"
page_connect = urlopen(url)
page_html = BeautifulSoup(page_connect, "html.parser")
raw_string = page_html.findAll(name="script")[3].text
start_ind = raw_string.index("\\")
stop_ind = raw_string.index("')")
json_data = raw_string[start_ind:stop_ind]
json_data = json_data.encode("utf8").decode("unicode_escape")
final_json_df = pd.json_normalize(json.loads(json_data))
a = final_json_df[final_json_df.shots == 0]
final_json_df = final_json_df.astype({"goals" : 'float'})
final_json_df = final_json_df.astype({"xG" : 'float'})
I tried this:
final_json_df[final_json_df.goals > xG]
but it doesn't seem to work. It would be helpful if someone could give the solution and explain why final_json_df[final_json_df.goals > xG] doesn't work.
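A likely fix, assuming the normalized frame really has goals and xG columns as the astype calls above suggest: the bare name xG is an undefined Python variable (it raises NameError), so the comparison has to reference the column on the DataFrame itself:

# compare the two columns row-wise; both were cast to float above
over_performers = final_json_df[final_json_df["goals"] > final_json_df["xG"]]
print(over_performers)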
I have built a list which contains hrefs from a website and I want to randomly select one of these links. How can I do that?
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles = a['href']
        x = random.choice(articles)
        print(x)
That code runs, but it only selects a random character from each href string instead of a random link from the whole list.
You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib.request
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    x = random.choice(articles)
    print(x)

getItems()
The changes are:
We add each article to the articles array.
The random choice is now done after the loop, rather than inside the loop.
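One edge case worth guarding, since the regex may match nothing if the page layout changes: random.choice raises IndexError on an empty list. A small defensive variant of the last two lines, reusing articles from above:

if articles:
    print(random.choice(articles))
else:
    print('no article links found')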
I have the following code:
import pandas as pd
import urllib.parse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
url = 'http://windte2001.acepta.com/v01/E67EBB4910CFDCB067EB7D85FBA6E5511D0E64A9'.replace('/v01/', '/depot/')
x = urlopen(url)
new = x.read()
soup = BeautifulSoup(new, "lxml-xml")
result = soup.find_all(['NmbItem','QtyItem'])
which brings the following result in XML:
[<NmbItem>SERV. MANEJO DE LIQUIDOS</NmbItem>, <QtyItem>22.00</QtyItem>, <NmbItem>SERV. MANEJO DE RESPEL</NmbItem>, <QtyItem>1.00</QtyItem>]
All I need is: if NmbItem contains 'LIQUIDOS', bring me the QtyItem, which in this case is 22.00.
How can I do this with Python on this XML?
You can use a regular expression.
import re
from bs4 import BeautifulSoup

soup = BeautifulSoup(new, 'xml')
result = soup.find('NmbItem', text=re.compile("LIQUIDOS")).find_next('QtyItem').text
print(result)
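If no NmbItem contains LIQUIDOS, find() returns None and the chained find_next() call raises AttributeError; a slightly defensive variant of the same lookup:

item = soup.find('NmbItem', text=re.compile("LIQUIDOS"))
if item is not None:
    print(item.find_next('QtyItem').text)
else:
    print('no matching item')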
You can do it like this:
result = soup.find_all(['NmbItem'])
for item in result:
    if 'LIQUIDOS' in item.text:
        print(list(item.next_siblings)[3].text)
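Note that indexing next_siblings with [3] depends on how many whitespace text nodes sit between the tags, so it can break if the XML formatting changes. Assuming QtyItem is a sibling element of NmbItem, as the output above suggests, matching the tag by name is less brittle:

for item in soup.find_all('NmbItem'):
    if 'LIQUIDOS' in item.text:
        print(item.find_next_sibling('QtyItem').text)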