I get the error "unmatched '}'" when I scrape a website (korter.az) - python

I want to crawl all advertisements, but the output is "unmatched '}'". Is there an easy way to do this? I tried BeautifulSoup before, but I think it's not the right way to do it, or I'm using it the wrong way.
How can I scrape all '199 yeni tikili binalar' from the website?
from ast import literal_eval
from bs4 import BeautifulSoup as bs
import requests
import re
import json
import pandas as pd

url = "https://korter.az/yasayis-kompleksleri-baku"
html_doc = requests.get(url).text
data = re.search(r'2804\.jpg"\}\}\}\],(".*")', html_doc).group(1)
data = json.loads(literal_eval(data))
df = pd.DataFrame(data)
df.to_excel('korter.xlsx', index=False)

The site has an API which can be accessed with requests.
The URL of the API is: "https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ"
Full Code
import requests
import math
import pandas as pd

def roundup(x):
    return int(math.ceil(x / 20.0)) * 20

# Getting the number of results
url1 = "https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ"
r = requests.get(url1)
no_of_outcomes = r.json()["totalBuildingsCount"]
# The total is 199, so round it up to a multiple of 20: the API only returns
# 20 results at a time, and the page count is the total divided by 20.
no_of_outcomes = roundup(no_of_outcomes)

# Getting the sub-URL from each page by looping
result_url = []
for k in range(1, int(no_of_outcomes/20)+1):
    url = f"https://korter.az/api/building/listing?mainGeoObjectId=1&page={k}&lang=az-AZ&locale=az-AZ"
    r = requests.get(url)
    subdata = r.json()["buildings"]
    for i in subdata:
        suburl = "https://korter.az" + i["url"]
        result_url.append(suburl)

print(len(result_url))
df = pd.DataFrame(result_url)
print(df)
Output
199
0
0 https://korter.az/toca-residence-baki
1 https://korter.az/malibu-residence-baki
2 https://korter.az/zirve-park-baki
3 https://korter.az/melissa-park-baki
4 https://korter.az/white-hotel-baki
.. ...
194 https://korter.az/yasham-boulevard-baki
195 https://korter.az/koroglu-baki
196 https://korter.az/luxor-palace-baki
197 https://korter.az/shirvanshahlar-residence-baki
198 https://korter.az/baki-baglari-baki
[199 rows x 1 columns]
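As a side note, the roundup helper rounds the total up to a multiple of 20 and then divides by 20 again; the page count could also be computed directly with math.ceil. A minimal equivalent sketch, assuming the API keeps returning 20 buildings per page:

import math

total_buildings = 199   # "totalBuildingsCount" from the API, per the output above
page_size = 20          # the listing endpoint returns 20 buildings per request
pages = math.ceil(total_buildings / page_size)
print(pages)            # 10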
Hope this helps. Happy Coding :)

Related

Appending Dataframe to another dataframe with first row removed

# Import libs
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

# Form data for passing to the request body
formdata = {'objid': '14'}
# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
# Query
for i in range(1, 15):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    out = response.content
    soup = BeautifulSoup(out, "html.parser")
    bat = json.loads(soup.text)
    df = pd.DataFrame(bat["ops1"])
    df.to_csv(str(i) + ".csv")
Right now this query creates 14 CSV files. What I want is for the for loop to remove the first row of column headers and append the data to a dataframe I created outside the for loop, so that I can get it as a single CSV file.
I am using BS and Pandas.
This is one way of achieving your goal:
# Import libs
import pandas as pd
import requests
from tqdm import tqdm  ## if using jupyter: from tqdm.notebook import tqdm

final_df = pd.DataFrame()
# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
# Query
for i in tqdm(range(1, 15)):
    formdata = {'objid': i}
    r = requests.post(url, data=formdata)
    df = pd.json_normalize(r.json()["ops1"])
    final_df = pd.concat([final_df, df], axis=0, ignore_index=True)

final_df.to_csv('some_data_saved.csv')
print(final_df)
Data will be saved to a csv file, and also printed in terminal:
100%
14/14 [00:14<00:00, 1.05s/it]
value text
0 8o7LEdvX2e G14001-Kumbadaje
1 jw2XOQyZ4K G14002-Bellur
2 0lMB1O4LbV G14003-Karadka
3 zodLro2Z39 G14004-Muliyar
4 dWxLYn8ZME G14005-Delampady
... ... ...
1029 Qy6Z09bBKE G01073-Ottoor
1030 ywoXG8wLxV M01001-Neyyattinkara
1031 Kk8Xvz7XO9 M01002-Nedumangad
1032 r7eXQYgX8m M01003-Attingal
1033 b3KXlO2B8g M01004-Varkala
1034 rows × 2 columns
Requests can return responses in JSON format, so you don't need to import bs4 & json.
For TQDM, please see https://pypi.org/project/tqdm/
For pandas documentation, visit https://pandas.pydata.org/docs/
Also for Requests: https://requests.readthedocs.io/en/latest/
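A minimal sketch of the difference, assuming the endpoint keeps returning JSON in the response body:

import requests

url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
r = requests.post(url, data={"objid": 1})

# Instead of json.loads(BeautifulSoup(r.content, "html.parser").text),
# requests can decode the JSON body directly:
bat = r.json()
print(bat["ops1"][:2])  # first two records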
I would use a function to get the data and return a DataFrame, then use it within concat:
def get_data(i):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    out = response.content
    soup = BeautifulSoup(out, "html.parser")
    bat = json.loads(soup.text)
    return pd.DataFrame(bat["ops1"])

df = pd.concat([get_data(i) for i in range(1, 15)])
df.to_csv('all_data.csv')
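A small follow-up on the concat call: passing ignore_index=True would give the combined frame a single clean 0..N-1 index instead of restarting the index for every chunk:

df = pd.concat([get_data(i) for i in range(1, 15)], ignore_index=True)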
NB. if this gives you unsatisfactory results, please provide a short extract of 2/3 dataframes and the expected merged output.

Trying to web scrape all the tables on a web page

import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
towns = pd.DataFrame()
url = f"https://www.city-data.com/city/Adak-Alaska.html"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
table_data = doc.findAll("td")
#for i in table_data:
#towns.append(table_data[i])
print(table_data)
I'm trying to get the data from the tables, like numbers of adherents to certain religions, ethnic groups, etc. When I look at the source page all that stuff is between the td tags but I'm not seeing it when I print out table_data. What am I doing wrong?
import pandas as pd
from bs4 import BeautifulSoup
import requests

towns = pd.DataFrame()
url = "https://www.city-data.com/city/Adak-Alaska.html"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
dfs = pd.read_html(page)
for x in dfs:
    print(x)  ## do what you will with the data
For instance, the religions would be table 17 (dfs[17]):
Religion Adherents Congregations
0 Orthodox 754 6
1 Evangelical Protestant 232 3
2 Catholic 185 1
3 Other 112 1
4 Mainline Protestant 82 1
5 None 4196 -
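Note that the table position can shift if the page layout changes, so hard-coding dfs[17] is a bit brittle. A hedged alternative, assuming the religion table keeps its "Adherents" header, is to search the extracted tables for that text:

# Pick the first extracted table whose text mentions "Adherents"
religion_df = next(df for df in dfs if "Adherents" in df.to_string())
print(religion_df)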
EDIT: Given the OP's insurmountable issues with his python install, a workaround would be:
url = "https://www.city-data.com/city/Adak-Alaska.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")
for x in soup.select('table'):
for z in x.select('tr'):
print([y.text.strip() for y in z.find_all(['td', 'th'])])
print('________________')
Results can be further transformed into dataframes.
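For example, a minimal sketch that turns each scraped table into a DataFrame, assuming the first row of a table can serve as its header and its data rows have matching widths:

import pandas as pd

tables = []
for x in soup.select('table'):
    rows = [[y.text.strip() for y in z.find_all(['td', 'th'])] for z in x.select('tr')]
    # Only keep tables with a header row plus data rows of the same width
    if len(rows) > 1 and all(len(r) == len(rows[0]) for r in rows[1:]):
        tables.append(pd.DataFrame(rows[1:], columns=rows[0]))
print(len(tables))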

Comparison between 2 columns in Pandas and printing only those rows that satisfy the condition

This is my code, and I want to compare the goals and xG columns and print only those rows which satisfy the condition goals > xG.
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
pd.set_option("max_colwidth", 99999)
pd.set_option("max_rows", 99999)
url = "https://understat.com/league/EPL"
page_connect = urlopen(url)
page_html = BeautifulSoup(page_connect, "html.parser")
raw_string = page_html.findAll(name="script")[3].text
start_ind = raw_string.index("\\")
stop_ind = raw_string.index("')")
json_data = raw_string[start_ind:stop_ind]
json_data = json_data.encode("utf8").decode("unicode_escape")
final_json_df = pd.json_normalize(json.loads(json_data))
a = final_json_df[final_json_df.shots == 0]
final_json_df = final_json_df.astype({"goals" : 'float'})
final_json_df = final_json_df.astype({"xG" : 'float'})
I tried this:
final_json_df[final_json_df.goals>xG]
but it doesn't seem to work. It would be helpful if someone could give a solution and explain why final_json_df[final_json_df.goals>xG] doesn't work.
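Nothing in the code above defines a standalone variable named xG, so the bare name raises a NameError before pandas ever evaluates the comparison; both sides have to be referenced through the DataFrame. A minimal sketch of the boolean-mask approach, assuming goals and xG have already been cast to float as above:

# Keep only the rows where goals exceed xG
mask = final_json_df["goals"] > final_json_df["xG"]
overperformers = final_json_df[mask]
print(overperformers)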

How can I get a random object from a list in Python

I have built a list which contains hrefs from a website, and I want to randomly select one of these links. How can I do that?
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles = a['href']
        x = random.choice(articles)
        print(x)
That code runs, but it only picks a random character from each href string instead of choosing one of the links at random.
You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    x = random.choice(articles)
    print(x)

getItems()
The changes are:
We add each article to the articles array.
The random choice is now done after the loop, rather than inside the loop.
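As a side note, a more compact variant (just a sketch, reusing the same soup and href pattern) could build the list in one comprehension and choose from it:

links = [a['href'] for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")})]
if links:
    print(random.choice(links))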

Get data from an XML with criteria

I have the following code:
import pandas as pd
import urllib.parse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
url = 'http://windte2001.acepta.com/v01/E67EBB4910CFDCB067EB7D85FBA6E5511D0E64A9'.replace('/v01/', '/depot/')
x = urlopen(url)
new = x.read()
soup = BeautifulSoup(new, "lxml-xml")
result = soup.find_all(['NmbItem','QtyItem'])
which returns the following result from the XML:
[<NmbItem>SERV. MANEJO DE LIQUIDOS</NmbItem>, <QtyItem>22.00</QtyItem>, <NmbItem>SERV. MANEJO DE RESPEL</NmbItem>, <QtyItem>1.00</QtyItem>]
All I need is: if NmbItem contains 'LIQUIDOS', bring me the QtyItem, which in this case is 22.
How can I do this with Python for this XML?
You can use a regular expression:
import re
from bs4 import BeautifulSoup

soup = BeautifulSoup(new, 'xml')
result = soup.find('NmbItem', text=re.compile("LIQUIDOS")).find_next('QtyItem').text
print(result)
You can do it like this:
result = soup.find_all(['NmbItem'])
for item in result:
    if 'LIQUIDOS' in item.text:
        print(list(item.next_siblings)[3].text)
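If more than one item needs to be looked up, a small sketch (assuming every NmbItem tag is followed by its matching QtyItem, as in the output above) could pair them up once:

quantities = {
    item.text: item.find_next('QtyItem').text
    for item in soup.find_all('NmbItem')
}
print(quantities.get('SERV. MANEJO DE LIQUIDOS'))  # '22.00'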
