Trying to make a loop with Selenium in Python

I have code that searches this site --> https://osu.ppy.sh/beatmapsets?m=0 for only the maps with the difficulty I want, but I can't get the loop right:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep

# Set link and path
driver = webdriver.Chrome(executable_path=r"C:\Users\Gabri\anaconda3\chromedriver.exe")
driver.get("https://osu.ppy.sh/beatmapsets?m=0")
wait = WebDriverWait(driver, 20)

# Variables, lists and counters
lista = {}
links, difficulty, maps2, final = [], [], [], []
line, column = 1, 1
link_test = ''
n = int(input('Insert how many maps you want: '))
c = 1

# Open the link in Chrome and search map by map
while True:
    if c > n:
        break
    sleep(1)
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, ".beatmapsets__items-row:nth-of-type(1)>.beatmapsets__item:nth-of-type(1)")))
    games = driver.find_element_by_css_selector(
        f".beatmapsets__items-row:nth-of-type({line}) .beatmapsets__item:nth-of-type({column}) .beatmapset-panel__info-row--extra")
    actions = ActionChains(driver)
    actions.move_to_element(games).perform()
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".beatmaps-popup__group")))
    scores = driver.find_elements_by_css_selector(
        ".beatmaps-popup__group .beatmaps-popup-item__col.beatmaps-popup-item__col--difficulty")

    # This is the part I can't make automatic; for example, to show 6 maps I would
    # have to add 2 more if's, changing the variables (line) and (column) accordingly.
    # I would like a loop with 'while' or 'for ... in', but I don't know how to write it.
    # I tried asking 'how many maps do you want?' before the code starts, so that number
    # would be how many times the loop executes, but it didn't work =(
    if c % 2 != 0:
        column = 2
        if c % 2 == 0:
            line += 1
    else:
        line += 1
        column = 1

    # Convert string to float (difficulty numbers)
    for score in scores:
        a = score.text
        b = a.replace(',', '.')
        difficulty.append(float(b))

    # Save in the list 'links' the link of the map currently being processed
    games.click()
    sleep(3)
    link_test = driver.current_url
    links.append(link_test)
    link_test = ''
    driver.back()

    # Dict with map, link and difficulty
    lista = {
        'map': f"{c}",
        'link': f"{links}",
        'difficulty': f"{difficulty}"}
    c += 1

    # Print each map in dict 'lista'
    print(f"Map: {lista['map']}\nLink: {links}\nDifficulty: {lista['difficulty']}\n")

    # This part is my filter: if a map has difficulty 6.00 or more, it is added to the list 'final' for download
    for b in difficulty:
        if b >= 6.00:
            # Without this block, printing the link raised "TypeError: unhashable type: 'list'";
            # I found this way to solve it. I know it's not the best way, but at least I tried =,)
            xam = str(links[0])
            xam1 = xam.replace("'", '')
            xam2 = xam1.replace("[", '')
            xam3 = xam2.replace("]", '')
            final.append(xam3)

    # Clear all lists so there are no duplicate items in dict 'lista' when the next map is selected
    difficulty.clear()
    lista.clear()
    links.clear()

# Print how many maps with difficulty 6.00+ were found
print(f'There are {len(sorted(set(final)))} maps to download')

# This question is for a future download feature; I'm still coding that part, so you can ignore it =3
pergunta = input('Do you want to download them? \n[ Y ]\n[ N ]\n>>> ').lower().strip()

# Remove duplicate links and show all filtered links
if pergunta == 'y':
    for x in final:
        maps2.append(x)
    print(sorted(set(maps2)))
In the if's part, I need help making it automatic, in a way that doesn't use as many if's as I did. Maybe with variables that increment themselves with 'v += n'? Idk ;-;
PS: If you find any logic errors or some way to optimize my code, I will be happy to learn and fix it.

You're doing way more work than you have to. When you visit the page in a browser and log your network traffic, you'll see that every time you scroll down to load more beatmaps, an XHR (XmlHttpRequest) HTTP GET request is made to a REST API, the response of which is JSON and contains all the beatmap information you could ever want. All you need to do is imitate that HTTP GET request - no Selenium required:
def get_beatmaps():
    import requests

    url = "https://osu.ppy.sh/beatmapsets/search"
    params = {
        "m": "0",
        "cursor[approved_date]": "0",
        "cursor[_id]": "0"
    }

    while True:
        # Pass the cursor params so pagination actually advances
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()

        cursor_id = data["cursor"]["_id"]
        if cursor_id == params["cursor[_id]"]:
            break

        yield from data["beatmapsets"]

        params["cursor[approved_date]"] = data["cursor"]["approved_date"]
        params["cursor[_id]"] = cursor_id


def main():
    from itertools import islice

    num_beatmaps = 10  # Get info for the first ten beatmaps
    beatmaps = list(islice(get_beatmaps(), num_beatmaps))

    for beatmap in beatmaps:
        print("{} - {}".format(beatmap["artist"], beatmap["title"]))
        for version in beatmap["beatmaps"]:
            print("    [{}]: {}".format(version["version"], version["difficulty_rating"]))
        print()

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
Aitsuki Nakuru - Monochrome Butterfly
    [Gibune's Insane]: 4.55
    [Excitement]: 5.89
    [Collab Extra]: 5.5
    [Hard]: 3.54
    [Normal]: 2.38

Sweet Trip - Chocolate Matter
    [drops an end to all this disorder]: 4.15
    [spoken & serafeim's hard]: 3.12

Aso Natsuko - More-more LOVERS!!
    [SS!]: 5.75
    [Sonnyc's Expert]: 5.56
    [milr_'s Hard]: 3.56
    [Dailycare's Insane]: 4.82

Takayan - Jinrui Mina Menhera
    [Affection]: 4.43
    [Normal]: 2.22
    [Narrative's Hard]: 3.28

Asaka - Seize The Day (TV Size)
    [Beautiful Scenery]: 3.7
    [Kantan]: 1.44
    [Seren's Oni]: 3.16
    [XK's Futsuu]: 2.01
    [ILOVEMARISA's Muzukashii]: 2.71
    [Xavy's Seize The Moment]: 4.06

Swimy - Acchi Muite (TV Size)
    [Look That Way]: 4.91
    [Azu's Cup]: 1.72
    [Platter]: 2.88
    [Salad]: 2.16
    [Sya's Rain]: 4.03

Nakazawa Minori (CV: Hanazawa Kana) - Minori no Zokkon Mirai Yohou (TV Size)
    [Expert]: 5.49
    [Normal]: 2.34
    [Suou's Hard]: 3.23
    [Suou's Insane]: 4.38
    [Another]: 4.56

JIN - Children Record (Re:boot)
    [Collab Hard]: 3.89
    [Maki's Normal]: 2.6
    [hypercyte & Seto's Insane]: 5.01
    [Kagerou]: 6.16

Coalamode. - Nemophila (TV Size)
    [The Hidden Dungeon Only I Can Enter]: 3.85
    [Silent's Hard]: 3
    [Normal]: 2.29

MISATO - Necro Fantasia
    [Lunatic]: 6.06
The way this example is written now, it grabs the first ten beatmaps from the API, prints the artist and title, and the name and difficulty of each version of that beatmap. You can change it to suit your needs, and filter the output based on difficulty.
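For example, a difficulty filter like the one in the question can be layered on top of the generator. This is just a sketch: the 6.0 cutoff comes from the question, and the "id" field is assumed to be the beatmapset id that appears in https://osu.ppy.sh/beatmapsets/<id> links.
def get_map_links(min_difficulty=6.0, limit=10):
    # Sketch: keep a beatmapset if any of its versions is at or above
    # min_difficulty, and build the link from the (assumed) "id" field.
    links = []
    for beatmap in get_beatmaps():
        if any(version["difficulty_rating"] >= min_difficulty
               for version in beatmap["beatmaps"]):
            links.append("https://osu.ppy.sh/beatmapsets/{}".format(beatmap["id"]))
        if len(links) >= limit:
            break
    return links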
That being said, I don't know anything about OSU or beatmaps. If you could describe what the final output should actually look like, I can tailor my solution.

After a lot of tests, I solved my whole problem (for now hehe).
Just add:
if c % 2 != 0:
    column = 2
    if c % 2 == 0:
        line += 1
else:
    line += 1
    column = 1
I'm so thankful to all the people who helped me =)))
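For reference, the same line/column bookkeeping can be computed directly from the counter with divmod, with no if's at all. A minimal sketch, assuming two maps per listing row as in the question:
# Sketch: derive the 1-based row and column for map number c,
# assuming the listing shows two beatmaps per row.
for c in range(1, n + 1):
    row_index, col_index = divmod(c - 1, 2)
    line = row_index + 1    # 1, 1, 2, 2, 3, 3, ...
    column = col_index + 1  # 1, 2, 1, 2, 1, 2, ...
    # ... process the map at (line, column) as in the loop above ...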

Related

Attempting to Traverse a search API Collecting results in Python 3.9

I am attempting to yield all results from a search API.
There are 3 key cases:
Case 1: 0 results are yielded
Case 2: 1-9 results are yielded
Case 3: 10 results are yielded
In cases 1 and 2, we can conclude that there are no results deeper. I.e., if we search for "iiiiia" and it yields 0, then there are no results for "iiiiia" and no results deeper than "iiiiia". If "iiiiia" yields 1-9 results, then we can conclude that "iiiiia" yields those 1-9 results and nothing deeper.
In case 3, if we search for "iiiiia" and it yields 10, we can conclude that "iiiiia" has 10 results, and there may or may not be results deeper, since 10 means 10+ results are possible. In this case we would have to move one layer down the chain and traverse through those results, i.e. "iiiiia0"-"iiiiiaz", and if any of those yield 10 we would also have to go one layer deeper.
an example of how this may look:
a
a0
a00
a000
a0000
a0001
...
a000z
a001
a002
a003
a0030
...
a0034
a00340
...
a0034z
a0035...
Here is my attempted code:
import json
from time import sleep
import urllib3
import requests
import re
import random
import sys
import numpy

# Saving the reference of the standard output
original_stdout = sys.stdout

header = {}
link = '/search?query='
http = requests.session()
fileName = '/output.txt'
hasCompleted = 0
number = '11'

def search(target):
    global targetList
    global link
    global header
    global http
    resp = http.request('GET', link + target, headers=header)
    match = re.findall('"key":"(.+?)","', resp.text)
    if len(match) == 10:
        return False
    if len(match) == 0:
        return " "
    elif len(match) < 10:
        return resp.text

def treeSearch():
    global fileName
    global number
    global hasCompleted
    if hasCompleted == 1:
        new = (int(int(number, 36) / 36) + 1)
        new = numpy.base_repr(new, 36)
        number = new
        hasCompleted = 0
    if hasCompleted == 0:
        x = 0
        new = int(int(number, 36) * 36)
        new = numpy.base_repr(new, 36)
        number = new
        while x < 37:
            new = int(int(number, 36) + 1)
            new = numpy.base_repr(new, 36)
            number = new
            result = search(number)
            print(number)
            if result:
                with open(fileName, 'a+') as f:
                    sys.stdout = f
                    print(result)
                    sys.stdout = original_stdout
                x = x + 1
            else:
                treeSearch()
        temp = number
        number = (int(int(number, 36) / 36) + 1)  # maybe not + 1
        print(number)
        number = numpy.base_repr(number, 36)
        print(number)
        result = search(number)
        if not result == " ":
            new = int(int(number, 36) * 36)
            new = numpy.base_repr(new, 36)
            number = new
            hasCompleted = 1

treeSearch()
Here is my output:
111
1111
11111
111111
1111111
11111111
111111111
1111111111
11111111111
111111111111
1111111111111
11111111111111
111111111111111
1111111111111111
1111111111111112
1111111111111113
1111111111111114
1111111111111115
1111111111111116
1111111111111117
1111111111111118
1111111111111119
111111111111111A
111111111111111B
111111111111111C
111111111111111D
111111111111111E
111111111111111F
111111111111111G
111111111111111H
111111111111111I
111111111111111J
111111111111111K
111111111111111L
111111111111111M
111111111111111N
111111111111111O
111111111111111P
111111111111111Q
111111111111111R
111111111111111S
111111111111111T
111111111111111U
111111111111111V
111111111111111W
111111111111111X
111111111111111Y
111111111111111Z
1111111111111120
1111111111111121
6316397706306666889217
11111111110QR5T
11111111110QR5U
11111111110QR5V
11111111110QR5W
11111111110QR5X
11111111110QR5Y
11111111110QR5Z
11111111110QR60
11111111110QR61
11111111110QR62
11111111110QR63
11111111110QR64
11111111110QR65
11111111110QR66
11111111110QR67
11111111110QR68
11111111110QR69
11111111110QR6A
11111111110QR6B
11111111110QR6C
11111111110QR6D
11111111110QR6E
11111111110QR6F
11111111110QR6G
11111111110QR6H
11111111110QR6I
11111111110QR6J
11111111110QR6K
11111111110QR6L
11111111110QR6M
11111111110QR6N
11111111110QR6O
11111111110QR6P
11111111110QR6Q
11111111110QR6R
11111111110QR6S
11111111110QR6T
11111111110QR6U
175455491841851850753
11111111110L4X
11111111110L4Y
11111111110L4Z
11111111110L50
11111111110L51
11111111110L52
11111111110L53
11111111110L54
11111111110L55
11111111110L56
11111111110L57
11111111110L58
11111111110L59
11111111110L5A
11111111110L5B
11111111110L5C
11111111110L5D
11111111110L5E
11111111110L5F
11111111110L5G
11111111110L5H
11111111110L5I
11111111110L5J
11111111110L5K
11111111110L5L
11111111110L5M
11111111110L5N
11111111110L5O
11111111110L5P
11111111110L5Q
11111111110L5R
11111111110L5S
11111111110L5T
11111111110L5U
11111111110L5V
11111111110L5W
11111111110L5X
11111111110L5Y
4873763662273662977
11111111110XT
11111111110XU
11111111110XV
11111111110XW
11111111110XX
11111111110XY
11111111110XZ
11111111110Y0
11111111110Y1
11111111110Y2
11111111110Y3
11111111110Y4
11111111110Y5
11111111110Y6
11111111110Y7
11111111110Y8
11111111110Y9
11111111110YA
11111111110YB
11111111110YC
11111111110YD
11111111110YE
11111111110YF
11111111110YG
11111111110YH
11111111110YI
11111111110YJ
11111111110YK
11111111110YL
11111111110YM
11111111110YN
11111111110YO
11111111110YP
11111111110YQ
11111111110YR
11111111110YS
11111111110YT
11111111110YU
135382323952046193
11111111110X
11111111110Y
11111111110Z
111111111110
111111111111
111111111121
111111111122
111111111123
111111111124
111111111125
111111111126
111111111127
111111111128
111111111129
11111111112A
11111111112B
11111111112C
11111111112D
11111111112E
11111111112F
11111111112G
11111111112H
11111111112I
11111111112J
11111111112K
11111111112L
11111111112M
11111111112N
11111111112O
11111111112P
11111111112Q
11111111112R
11111111112S
11111111112T
11111111112U
11111111112V
11111111112W
11111111112X
11111111112Y
11111111112Z
111111111130
111111111131
3760620109779064
11111111114
111111111141
111111111142
111111111143
111111111144
111111111145
111111111146
111111111147
111111111148
111111111149
11111111114A
11111111114B
11111111114C
11111111114D
11111111114E
11111111114F
11111111114G
11111111114H
11111111114I
11111111114J
11111111114K
11111111114L
11111111114M
11111111114N
11111111114O
11111111114P
11111111114Q
11111111114R
11111111114S
11111111114T
11111111114U
11111111114V
11111111114W
11111111114X
11111111114Y
3760620109779066
11111111116
11111111117
11111111118
11111111119
1111111111A
1111111111B
1111111111C
1111111111D
1111111111E
1111111111F
1111111111G
1111111111H
1111111111I
1111111111J
1111111111K
1111111111L
1111111111M
1111111111N
1111111111O
1111111111P
1111111111Q
1111111111R
1111111111S
1111111111T
1111111111U
1111111111V
1111111111W
1111111111X
1111111111Y
1111111111Z
11111111120
11111111121
11111111122
11111111123
11111111124
11111111125
11111111126
11111111127
104461669716087
1111111113
1111111114
1111111115
1111111116
1111111117
1111111118
1111111119
111111111A
111111111B
111111111C
111111111D
111111111E
111111111F
111111111G
111111111H
111111111I
111111111J
111111111K
111111111L
111111111M
111111111N
111111111O
111111111P
111111111Q
111111111R
111111111S
111111111T
111111111U
111111111V
111111111W
111111111X
111111111Y
111111111Z
1111111120
1111111121
1111111122
1111111123
1111111124
2901713047671
111111113
111111114
111111115
111111116
111111117
111111118
111111119
11111111A
11111111B
11111111C
11111111D
11111111E
11111111F
11111111G
11111111H
11111111I
11111111J
11111111K
11111111L
11111111M
11111111N
11111111O
11111111P
11111111Q
11111111R
11111111S
11111111T
11111111U
11111111V
11111111W
11111111X
11111111Y
11111111Z
111111120
111111121
111111122
111111123
111111124
80603140215
11111113
11111114
11111115
11111116
11111117
11111118
11111119
1111111A
1111111B
1111111C
1111111D
1111111E
1111111F
1111111G
1111111H
1111111I
1111111J
1111111K
1111111L
1111111M
1111111N
1111111O
1111111P
1111111Q
1111111R
1111111S
1111111T
1111111U
1111111V
1111111W
1111111X
1111111Y
1111111Z
11111120
11111121
11111122
11111123
11111124
2238976119
1111113
11111131
11111132
11111133
11111134
11111135
11111136
11111137
11111138
11111139
1111113A
1111113B
1111113C
1111113D
1111113E
1111113F
1111113G
1111113H
1111113I
1111113J
1111113K
1111113L
1111113M
1111113N
1111113O
1111113P
1111113Q
1111113R
1111113S
1111113T
1111113U
1111113V
1111113W
1111113X
1111113Y
1111113Z
11111140
11111141
2238976121
1111115
11111151
11111152
11111153
11111154
11111155
11111156
11111157
11111158
11111159
1111115A
1111115B
1111115C
1111115D
1111115E
1111115F
1111115G
1111115H
1111115I
1111115J
1111115K
1111115L
1111115M
1111115N
1111115O
1111115P
1111115Q
1111115R
1111115S
1111115T
1111115U
1111115V
1111115W
1111115X
1111115Y
1111115Z
11111160
11111161
2238976123
1111117
11111171
11111172
11111173
11111174
11111175
11111176
11111177
11111178
11111179
1111117A
1111117B
1111117C
1111117D
1111117E
1111117F
1111117G
1111117H
1111117I
1111117J
1111117K
1111117L
1111117M
1111117N
1111117O
1111117P
1111117Q
1111117R
1111117S
1111117T
1111117U
1111117V
1111117W
1111117X
1111117Y
1111117Z
11111180
11111181
2238976125
1111119
111111A
111111B
111111C
111111D
111111E
111111F
111111G
111111H
111111I
111111J
111111K
111111L
111111M
111111N
111111O
111111P
111111Q
111111R
111111S
111111T
111111U
111111V
111111W
111111X
111111Y
111111Z
1111120
1111121
1111122
1111123
1111124
1111125
1111126
1111127
1111128
1111129
111112A
62193783
111113
1111131
1111132
1111133
1111134
1111135
1111136
1111137
1111138
1111139
111113A
111113B
111113C
111113D
111113E
111113F
111113G
111113H
111113I
111113J
111113K
111113L
111113M
111113N
111113O
111113P
111113Q
111113R
111113S
111113T
111113U
111113V
111113W
111113X
111113Y
111113Z
1111140
1111141
62193785
111115
My code only traverses one level deeper and then comes back out. I will keep working on it, but I am hoping to find an easier solution, or possibly a library that can perform this style of search. Thanks!
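One way to structure this without the base-36 arithmetic is a depth-first recursive generator. Below is a sketch under the question's assumptions (a page size of 10 and a 36-character alphabet); query_api is a hypothetical stand-in for the search call above, returning the list of matched keys for a prefix:
import string

ALPHABET = string.digits + string.ascii_lowercase  # the 36 possible next characters

def traverse(query_api, prefix):
    results = query_api(prefix)
    if len(results) < 10:
        # Cases 1 and 2: fewer than 10 results means nothing exists deeper,
        # so record these results and stop descending.
        yield from results
        return
    # Case 3: a full page of 10 means 10+ results are possible,
    # so move one layer down and traverse every child prefix.
    for ch in ALPHABET:
        yield from traverse(query_api, prefix + ch)

Calling list(traverse(query_api, "a")) walks "a", then "a0" through "az", and so on, descending only where a prefix returns a full page, which matches the tree in the question.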

BeautifulSoup trying to get text from wrapped divs but empty or "none" is being returned

Here is a picture (sorry) of the HTML that I am trying to parse:
I am using this line:
home_stats = soup.select_one('div', class_='statText:nth-child(1)').text
Thinking that I'd get the 1st child of the class statText and the outcome would be 53%.
But it's not. I get "Loading..." and none of the data that I was trying to use and display.
The full code I have so far:
soup = BeautifulSoup(source, 'lxml')
home_team = soup.find('div', class_='tname-home').a.text
away_team = soup.find('div', class_='tname-away').a.text
home_score = soup.select_one('.current-result .scoreboard:nth-child(1)').text
away_score = soup.select_one('.current-result .scoreboard:nth-child(2)').text
print("The home team is " + home_team, "and they scored " + home_score)
print()
print("The away team is " + away_team, "and they scored " + away_score)
home_stats = soup.select_one('div', class_='statText:nth-child(1)').text
print(home_stats)
This currently does print the home and away team and the number of goals they scored, but I can't seem to get any of the statistical content from this site.
My output plan is to have:
[home_team] had 53% ball possession and [away_team] had 47% ball possession
However, I would like to remove the "%" symbols from the parse (but that's not essential). My plan is to use these numbers for more stats later on, so the % symbol gets in the way.
Apologies for the noob question - this is the absolute beginning of my Pythonic journey. I have scoured the internet and StackOverflow and just cannot find this situation - I also possibly don't know exactly what I am looking for either.
Thanks kindly for your help! May your answer be the one I pick as "correct" ;)
Assuming that this is the website you are trying to scrape, here is the complete code to scrape all the stats:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome('chromedriver.exe')
driver.get('https://www.scoreboard.com/en/match/SO3Fg7NR/#match-statistics;0')
pg = driver.page_source  # Gets the source code of the page
driver.close()

soup = BeautifulSoup(pg, 'html.parser')  # Creates a soup object
statrows = soup.find_all('div', class_="statTextGroup")  # Finds all the div tags with class statTextGroup -- these div tags contain the stats

# Scrapes the team names
teams = soup.find_all('a', class_="participant-imglink")
teamslst = []
for x in teams:
    team = x.text.strip()
    if team != "":
        teamslst.append(team)

stats_dict = {}
count = 0
for x in statrows:
    txt = x.text
    final_txt = ""
    stat = ""
    alphabet = False
    percentage = False
    # Extracts the numbers from the text
    for c in txt:
        if c in '0123456789':
            final_txt += c
        else:
            if alphabet == False:
                final_txt += "-"
                alphabet = True
            if c != "%":
                stat += c
            else:
                percentage = True
    values = final_txt.split('-')
    # Appends the values to the dictionary
    for x in values:
        if stat in stats_dict.keys():
            if percentage == True:
                stats_dict[stat].append(x + "%")
            else:
                stats_dict[stat].append(int(x))
        else:
            if percentage == True:
                stats_dict[stat] = [x + "%"]
            else:
                stats_dict[stat] = [int(x)]
    count += 1
    if count == 15:
        break

index = [teamslst[0], teamslst[1]]
# Creates a pandas DataFrame out of the dictionary
df = pd.DataFrame(stats_dict, index=index).T
print(df)
Output:
Burnley Southampton
Ball Possession 53% 47%
Goal Attempts 10 5
Shots on Goal 2 1
Shots off Goal 4 2
Blocked Shots 4 2
Free Kicks 11 10
Corner Kicks 8 2
Offsides 2 1
Goalkeeper Saves 0 2
Fouls 8 10
Yellow Cards 1 0
Total Passes 522 480
Tackles 15 12
Attacks 142 105
Dangerous Attacks 44 29
Hope that this helps!
P.S.: I actually wrote this code for a different question, but I didn't post it there because an answer had already been posted! I didn't know it would come in handy now! Anyway, I hope my answer does what you need.
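A footnote on the original line: select_one() expects the whole CSS selector in a single string, so the class name and the :nth-child() pseudo-class belong inside the selector rather than in a find()-style class_ keyword. Also, since the site fills the stats in with JavaScript, the selector has to run against browser-rendered source (as in the answer above); a plain HTTP fetch only ever contains the "Loading..." placeholder. A sketch; whether :nth-child(1) actually picks the home value is an assumption about the page's markup:
# Sketch: one selector string, run against the Selenium-rendered soup.
home_stats = soup.select_one('div.statText:nth-child(1)').text  # e.g. "53%"
home_possession = int(home_stats.strip('%'))  # 53 - with the "%" removed, as the question wanted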

webscraping with selenium to click a button and grab everything

I have been working on this scraper for a while and I think it could be improved but I'm not sure where to go from here.
The initial scraper looks like this and I believe it does everything I need it to do:
url = "https://matrix.heartlandmls.com/Matrix/Public/Portal.aspx?L=1&k=990316X949Z&p=DE-74613894-421"
h_table = []
driver = webdriver.Firefox()
driver.get(url)
driver.find_element_by_xpath("/html/body/form/div[3]/div/div/div[5]/div[3]/span[2]/div/div/div[2]/div[1]/div/div/div[2]/div[2]/div[1]/span/a").click()
time.sleep(10)
i = 200
while i > 0:
    h_table.append(driver.find_element_by_id("wrapperTable").text)
    driver.find_element_by_xpath("/html/body/form/div[3]/div/div/div[5]/div[2]/div/div[1]/div/div/span/ul/li[2]/a").click()
    time.sleep(10)
    i -= 1
This outputs everything into a table which I can clean up:
['210 Sitter Street\nPleasant Hill, MO 64080\nMLS#:2178982\nMap\n$15,000\nSold\n4Bedrms\n2Full Bath(s)\n0Half Bath(s)\n1,848Sqft\nBuilt in1950\n0.27Acres\nSingle Family\n1 / 10\nThis Home sits on a level, treed, and nice .279 acre sizeable double lot. The property per taxes, is identified as a Single Family Home however it has 2 separate utility meters and 2 living spaces, each with 2 bedrooms and 1 full bath and laundry areas, and was utilized as a Duplex for Rental income for 2 units. This property is a CASH ONLY sale and is being sold "In It\'s Present Condition". Home and detached garage are in need of repair OR would be a candidate for a tear down and complete rebuild on the lot.\nAbout 210 Sitter Street, Pleasant Hill, MO 64080\nDirections:I-70 to 7 Hwy, to Broadway, to Sitter St, to property.\nGeneral Description\nMLS Number\n2178982\nCounty\nCass\nCity\nPleasant Hill\nSub Div\nWalkers & Sitlers\nType\nSingle Family\nFloor Plan Description\nRanch\nBdrms\n4\nBaths Full\n2\nBaths Half\n0\nAge Description\n51-75 Years\nYear Built\n1950\nSqft Main\n1848\nSQFT MAIN SOURCE\nPublic Record\nBelow Grade Finished Sq Ft\n0\nBelow Grade Finished Sq Ft Source\nPublic Record\nSqft\n1848\nLot Size\n12,155\nAcres\n0.27\nSchools E\nPleasant Hill Prim\nSchools M\nPleasant Hill\nSchools H\nPleasant Hill\nSchool District\nPleasant Hill\nLegal Description\nWALKER & SITLERS LOT 47 & 48 BLK 5\nS Terms\nCash\nInterior Features\nFireplace?\nY\nFireplace Description\nLiving Room, Wood Burning\nBasement\nN\nBasement Description\nBlock, Crawl Space\nDining Area Description\nEat-In Kitchen\nUtility Room\nMultiple, Main Level\nInterior Features\nFixer Up\nRooms\nBathroom Full\nLevel 1\n2nd Full Bath\nLevel 1\nMaster Bedroom\nLevel 1\nSecond Bedroom\nLevel 1\nMaster BR- 2nd\nLevel 1\nFourth Bedroom\nLevel 1\nKitchen\nLevel 1\nKitchen- 2nd\nLevel 1\nLiving Room\nLevel 1\nFamily Rm- 2nd\nLevel 1\nExterior / Construction\nGarage/Parking?\nY\nGarage/Parking #\n2\nGarage Description\nDetached, Front Entry\nConstruction\nFrame\nArchitecture\nTraditional\nRoof\nComposition\nLot Description\nCity Limits, City Lot, Level, Treed\nIn Floodplain\nNo\nInside City Limits\nYes\nStreet Maintenance\nPub Maint, Paved\nExterior Features\nFixer Up\nUtility Information\nCentral Air\nY\nHeat\nForced Air Gas\nCool\nCentral Electric, Window Unit(s)\nWater\nCity/Public\nSewer\nCity/Public\nFinancial Information\nS Terms\nCash\nHoa Amount\n$0\nTax\n$1,066\nSpecial Tax\n$0\nTotal Tax\n$1,066\nExclusions\nEntire Property\nType Of Ownership\nPrivate\nWill Sell\nCash\nAssessment & Tax\nAssessment Year\n2019\n2018\n2017\nAssessed Value - Total\n$17,240\n$15,380\n$15,380\nAssessed Value - Land\n$2,400\n$1,920\n$1,920\nAssessed Value - Improved\n$14,840\n$13,460\n$13,460\nYOY Change ($)\n$1,860\n$\nYOY Change (%)\n12%\n0%\nTax Year\n2019\n2018\n2017\nTotal Tax\n$1,178.32\n$1,065.64\n$1,064.30\nYOY Change ($)\n$113\n$1\nYOY Change (%)\n11%\n0%\nNotes for you and your agent\nAdd Note\nMap data ©2020\nTerms of Use\nReport a map error\nMap\n200 ft \nParcel Disclaimer'
However, I had seen some other examples with WebDriverWait, which I think would greatly speed up the scraper, but so far I have been unsuccessful. Here's the code I wrote:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = "https://matrix.heartlandmls.com/Matrix/Public/Portal.aspx?L=1&k=990316X949Z&p=DE-74613894-421"
h_table = []
xpath = '/html/body/form/div[3]/div/div/div[5]/div[2]/div/div[1]/div/div/span/ul/li[2]/a'

driver = webdriver.Firefox()
driver.get(url)
driver.find_element_by_xpath("/html/body/form/div[3]/div/div/div[5]/div[3]/span[2]/div/div/div[2]/div[1]/div/div/div[2]/div[2]/div[1]/span/a").click()
time.sleep(10)

while True:
    button = driver.find_elements_by_xpath("/html/body/form/div[3]/div/div/div[5]/div[2]/div/div[1]/div/div/span/ul/li[2]/a")
    if len(button) < 1:
        print('done')
        break
    else:
        h_table.append(driver.find_element_by_id("wrapperTable").text)
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, 'xpath'))).click()
This seems to give all the results, but it produces duplicates, and I couldn't stop it without a keyboard interrupt.
Calling len(h_table) gives 258, where it should be 200.
If the length of your list is the problem, why not use:
if len(h_table) >= 200:
    print("done")
    break
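One more detail worth checking: the explicit wait is given the literal string 'xpath' instead of the xpath variable, so it never waits on the actual button. A bounded loop along these lines would avoid both the duplicates and the keyboard interrupt (a sketch reusing the names defined in the question):
# Sketch: visit exactly 200 listings, waiting on the real XPath each time.
for _ in range(200):
    h_table.append(driver.find_element_by_id("wrapperTable").text)
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    ).click()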

How to limit number of rows that fill a dataframe in a for loop

I have written the following function that scrapes multiple pages from a website. I only want to get the first 20 or so pages. How can I limit the number of rows that I fill in my dataframe:
def scrape_page(poi, page_name):
    base_url = "https://www.fake_website.org/"
    report_url = (base_url + poi)
    page = urlopen(report_url)
    experiences = BeautifulSoup(page, "html.parser")
    empty_list = []
    for link in experiences.findAll('a', attrs={'href': re.compile(page_name + ".shtml$")}):
        url = urljoin(base_url, link.get("href"))
        subpage = urlopen(url)
        expages = BeautifulSoup(subpage, "html.parser")
        for report in expages.findAll('a', attrs={'href': re.compile("^/experiences/exp")}):
            url = urljoin(base_url, report.get("href"))
            reporturlopen = urlopen(url)
            reporturl = BeautifulSoup(reporturlopen, "html.parser")
            book_title = reporturl.findAll("div", attrs={'class': 'title'})
            for i in book_title:
                title = i.get_text()
            book_genre = reporturl.findAll("div", attrs={'class': 'genre'})
            for i in book_genre:
                genre = i.get_text()
            book_author = reporturl.findAll("div", attrs={'class': 'author'})
            for i in book_author:
                author = i.get_text()
                author = re.sub("by", "", author)
            empty_list.append({'title': title, 'genre': genre, 'author': author})
    setattr(sys.modules[__name__], '{}_df'.format(poi + "_" + page_name), empty_list)
You can for example add a while loop:
i = 0
while i < 20:
    <insert your code>
    i += 1
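Applied to the function in the question, the counter can simply be the length of empty_list, checked before each new report is scraped. A sketch, with 20 as the row limit:
MAX_ROWS = 20  # the dataframe should get at most this many rows

for report in expages.findAll('a', attrs={'href': re.compile("^/experiences/exp")}):
    if len(empty_list) >= MAX_ROWS:
        break
    # ... scrape title, genre and author as above ...
    empty_list.append({'title': title, 'genre': genre, 'author': author})

The same len(empty_list) check in the outer loop stops the function from opening further subpages once the limit is reached.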

Issues on dividing html part through 'tr' tag using Selenium Python

I tried to collect the data from this page (http://www.bobaedream.co.kr/mycar/popup/mycarChart_4.php?zone=C&cno=639137&tbl=cyber) using Selenium with Python 3.6. What I tried to do is divide the section into two parts and collect the data from each part.
The part is like below:
The items in the two parts are made of 39 'tr' tags. I select the 0th to 14th 'tr' tags for the first part and the 15th to the last 'tr' tags for the second part. But the first part already prints all the way to the last 'tr' tag. I don't understand why this happens.
Below is my code:
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import urlparse
from urllib.parse import quote
from selenium import webdriver
import re
import time

popup_inspection = "http://www.bobaedream.co.kr/mycar/popup/mycarChart_4.php?zone=C&cno=639137&tbl=cyber"

driver = webdriver.PhantomJS()
driver.set_window_size(500, 300)
driver.get(popup_inspection)
soup_inspection = BeautifulSoup(driver.page_source, "html.parser")

count = 0       # for loop count
count_insp = 0  # leaks and malfunctions (누유 및 오작동)
count_in = 0    # frame (골격)
count_out = 0   # exterior (외관)

insp_tables = soup_inspection.find_all('table', class_=True)
for insp_table in insp_tables[4].find_all('tr'):
    labels = insp_table.find_all('td', class_="center")
    for label in labels[:15]:
        if label.find("input", type="checkbox", checked=True):
            count_out += 1
            print(label.text)
        else:
            print(label.text)
    print("외관 이상 수: ", count_out)

    for label in labels[16:]:
        if label.find("input", type="checkbox", checked=True):
            count_in += 1
            print(label.text)
        else:
            print(label.text)
    print("골격 이상 수: ", count_in)
The result I would like to have is like below:
<Upper Part>
1 후드 0 0
2 프론트 휀더(좌) 0 0
......
8 트렁크 리드 1 0
Total : 1 0
<Lower Part>
1 프론트 패널
2 크로스 멤버
....
22 리어 패널 1 0
23 트렁크 플로어 0 0
Total : 1 0
Please help me to work this out.
Thanks.
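For anyone with the same symptom: the slices in the code above are applied to labels, i.e. the 'td' cells inside a single 'tr', not to the 39 'tr' rows themselves, so the outer loop still visits every row and the first part runs to the end of the table. Slicing the row list first should give the intended split. A sketch, with the 15-row boundary taken from the question:
# Sketch: split the 39 <tr> rows first, then read the cells of each row.
rows = insp_tables[4].find_all('tr')
upper_part, lower_part = rows[:15], rows[15:]

for row in upper_part:
    for label in row.find_all('td', class_="center"):
        if label.find("input", type="checkbox", checked=True):
            count_out += 1
        print(label.text)
print("외관 이상 수: ", count_out)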
