My problem is, it loops through pages, but it doesn't write anything into my list.
At the end I print len(title) and it is still 0.
from bs4 import BeautifulSoup
import requests
# Question code: walk the paginated search results and collect listing titles.
# NOTE(review): indentation was lost in this listing; the nesting below is
# reconstructed from the description ("at the end I print len(title)").
for page in range(20, 200, 20):
    current_page = 'https://auto.bazos.sk/{}/?hledat=kolesa&hlokalita=&humkreis=&cen'.format(page)
    # NOTE(review): web_req is never read; the same URL is requested again on the next line.
    web_req = requests.get(current_page).text
    soup = BeautifulSoup(requests.get(current_page).content, 'html.parser')
    # All elements carrying the CSS class "nadpis".
    title_data = soup.select('.nadpis')
    # NOTE(review): title is rebound to a fresh empty list on every iteration,
    # so only the titles of the most recently fetched page survive the loop.
    title = []
    for each_title in title_data:
        title.append(each_title.text)
    print(current_page)
print(len(title))
Move `title = []` out of the loop. As written, the list is reset to empty on every page, so only the last page's results are left at the end.
import requests
from bs4 import BeautifulSoup
# The accumulator is created once, before the loop, so every page adds to it.
title = []
for offset in range(20, 40, 20):
    current_page = 'https://auto.bazos.sk/{}/?hledat=kolesa&hlokalita=&humkreis=&cen'.format(offset)
    html = requests.get(current_page).content
    # Every element with the CSS class "nadpis" on this page.
    headings = BeautifulSoup(html, 'html.parser').select('.nadpis')
    title.extend(heading.text for heading in headings)
    print(current_page)
print(title)
Output:
['ELEKTRONY SKODA OCTAVIA SCOUT DISKY “PROTEUS” R17', 'Fiat Sedici 1.6, 4x4, r.v 04/2009, 79 kw, slovenské ŠPZ', 'Bmw e46 328ci', '255/50 R19', 'Honda Jazz 1.3', 'Predám 4 ks kolesá', 'Audi A5 3.2 FSI quattro tiptronic S LINE R20 TOP STAV', 'Peugeot 407 combi 1,6 hdi', 'Škoda Superb 2.0TDI 4x4 od 260€ mesačne, bez akontácia', 'Predam elektrony Audi 5x112 R17 a letne pneu', 'ROZPREDÁM MAZDA 3 2.0i 110kW NA NÁHRADNÉ DIELY', 'Predám Astra j Turbo Noblesse bronz', 'ŠKODA KAROQ 1.6 TDI - full výbava', 'VW CHICAGO 5x112 + letné pneu 215/40 R18', 'Fiat 500 SPORT 1.3 multijet 70kw', 'Volvo FL280 - TROJSTRANNÝ SKLÁPAČ + HYDRAULICKÁ RUKA', 'ŠKODA SUPERB COMBI 2.0 TDI 190K 4X4 L&K DSG', 'FORD FOCUS 2.0 TDCI TITANIUM', 'FORD EDGE 2.0 TDCi - 154 kW VIGNALE : 27.000 km', 'R18 5x112 originalne Vw Seat Audi Skoda']
How do I decode the following response from this url in python? https://www.scorespro.com/livescore/ajax0.php
1599071734^^~~##Wed 02 Sep 21:35 GMT +03^^~~##2361498-1##194837##0##2020-09-02 17:00:00##76##1##18032##17842##Club Friendly##0##Real Sociedad##Villarreal##CLB##un##FG##0-2##0-2##2 HF##Friendly Games##1599066000######2##99######real-sociedad-vs-villarreal/02-09-2020##friendly-games##club-friendly##2##LEAGUE##2020##round-1##0####0##1599066240##############0##0 2325164-1##196097##0##2020-09-02 17:00:00##71##1##105187##104946##Canadian Premier League - Premier League##0##Valour FC (5)##HFX Wanderers FC (6)##PL##ca##CAN##0-2##0-2##2 HF##Canada##1599066000######2##99######valour-fc-vs-hfx-wanderers-fc/30-05-2020##canada##premier-league##1##LEAGUE##2020##round-1##1##canadian-premier-league##0##1599066540##############0##0 2338959-1##197065##0##2020-09-02 17:00:00##81##4##39942##41961##Regionalliga Nordost##0##Germania Halberstadt (18)##Optik Rathenow (20)##N/E##de##GER##0-2##0-0##2 HF##Germany##1599066000######2##99######germania-halberstadt-vs-optik-rathenow/02-09-2020##germany##regionalliga-nordost##5##LEAGUE##2020-2021##round-4##1####0##1599065940##############0##0 2338955-1##197065##0##2020-09-02 17:00:00##81##4##44124##56097##Regionalliga Nordost##0##Viktoria Berlin (2)##VSG Altglienicke (1)##N/E##de##GER##2-1##1-1##2 HF##Germany##1599066000######2##99######viktoria-berlin-vs-vsg-altglienicke/02-09-2020##germany##regionalliga-nordost##5##LEAGUE##2020-2021##round-4##1####0##1599065940##############0##0 2338958-1##197065##0##2020-09-02 17:00:00##78##4##13847##3034##Regionalliga Nordost##0##SV Babelsberg 03 (12)##Chemnitzer FC (15)##N/E##de##GER##2-2##1-1##2 HF##Germany##1599066000######2##99######sv-babelsberg-03-vs-chemnitzer-fc/02-09-2020##germany##regionalliga-nordost##5##LEAGUE##2020-2021##round-4##1####0##1599066120##############0##0 2338954-1##197065##0##2020-09-02 17:00:00##79##4##21173##37508##Regionalliga Nordost##0##Hertha Berlin II (5)##Berliner AK 07 (16)##N/E##de##GER##2-5##1-2##2 
HF##Germany##1599066000######2##99######hertha-berlin-ii-vs-berliner-ak-07/02-09-2020##germany##regionalliga-nordost##5##LEAGUE##2020-2021##round-4##1####0##1599066060##############0##0 2361307-1##197664##0##2020-09-02 17:00:00##81##1##24981##21152##Landspokal##0##Slagelse##Dalum##CUP##dk##DEN##1-1##0-0##2 HF##Denmark##1599066000######2##99######slagelse-vs-dalum/01-09-2020##denmark##fa-cup##6##PHASE##2020-2021##round-1##0####0##1599065940##2.20##2.87##3.75########0##0 2338953-1##197065##0##2020-09-02 17:00:00##80##4##41959##2993##Regionalliga Nordost##0##VfB Auerbach (7)##Energie Cottbus (19)##N/E##de##GER##2-4##1-2##2 HF##Germany##1599066000######2##99######vfb-auerbach-vs-energie-cottbus/02-09-2020##germany##regionalliga-nordost##5##LEAGUE##2020-2021##round-4##1####0##1599066000##############0##0 2307988-1##195163##0##2020-09-02 17:15:00##62##10##34050##49891##Serie A - First Stage##0##CD Olmedo (16)##Delfin SC (10)##SA1##ec##ECU##2-0##2-0##2 HF##Ecuador##1599066900######2##99######cd-olmedo-vs-delfin-sc/10-05-2020##ecuador##first-stage##1##LEAGUE##2020##round-10##1##serie-a##0##1599067080##2.45##2.55##3.25########1##1 2338956-1##197065##0##2020-09-02 17:30:00##HT##4##41960##50882##Regionalliga Nordost##0##Lokomotive Leipzig (13)##FSV 63 Luckenwalde (8)##N/E##de##GER##1-0##1-0##H/T##Germany##1599067800######2##99######lokomotive-leipzig-vs-fsv-63-luckenwalde/02-09-2020##germany##regionalliga-nordost##5##LEAGUE##2020-2021##round-4##1####0##0##############0##0 2367153-1##194837##0##2020-09-02 17:30:00##HT##1##18022##4189##Club Friendly##0##Real Betis##Almeria##CLB##un##FG##1-0##1-0##H/T##Friendly Games##1599067800######2##99######real-betis-vs-almeria/02-09-2020##friendly-games##club-friendly##2##LEAGUE##2020##round-1##0####0##0##1.53##6.00##3.50########0##0 2313051-1##195400##0##2020-09-02 17:30:00##48##15##43773##103469##1. 
Deild##0##Magni (12)##Afturelding (8)##D2##is##ISL##2-0##2-0##2 HF##Iceland##1599067800######2##99######magni-vs-afturelding/29-07-2020##iceland##1-deild##2##LEAGUE##2020##round-15##1####0##1599067920##############0##0 2366633-1##194837##0##2020-09-02 17:30:00##47##1##18052##28704##Club Friendly##0##Levante##Cartagena##CLB##un##FG##1-1##1-1##2 HF##Friendly Games##1599067800######2##99######levante-vs-cartagena/02-09-2020##friendly-games##club-friendly##2##LEAGUE##2020##round-1##0####0##1599067980##1.40##5.25##4.20########0##1 2313052-1##195400##0##2020-09-02 17:30:00##47##15##52987##30414##1. Deild##0##Vestri (7)##Thor Akureyri (5)##D2##is##ISL##1-0##1-0##2 HF##Iceland##1599067800######2##99######vestri-vs-thor-akureyri/29-07-2020##iceland##1-deild##2##LEAGUE##2020##round-15##1####0##1599067980##############0##0 2313056-1##195400##0##2020-09-02 17:30:00##47##15##26363##32547##1. Deild##0##IBV Vestmannaeyjar (3)##Leiknir R. (4)##D2##is##ISL##0-2##0-2##2 HF##Iceland##1599067800######2##99######ibv-vestmannaeyjar-vs-leiknir-r/01-08-2020##iceland##1-deild##2##LEAGUE##2020##round-15##1####0##1599067980##############0##0 2363441-1##194837##0##2020-09-02 18:00:00##36##1##21281##3220##Club Friendly##0##Benfica##SC Braga##CLB##un##FG##0-0##-##1 HF##Friendly Games##1599069600######2##99######benfica-vs-sc-braga/02-09-2020##friendly-games##club-friendly##2##LEAGUE##2020##round-1##0####0##1599069540##1.55##5.50##3.50########0##0 2289461-1##193678##0##2020-09-02 18:30:00##4##24##40009##40019##Premier League##0##Smouha SC (6)##El Entag El Harby (14)##PL##eg##EGY##0-0##-##1 HF##Egypt##1599071400######2##99######smouha-sc-vs-el-entag-el-harby/02-03-2020##egypt##premier-league##1##LEAGUE##2019-2020##round-24##1####0##1599071460##2.15##4.00##2.70########0##0 2211667-1##190057##0##2020-09-02 18:30:00##1##1##23376##19092##U21 Championship - Qualifying Group Stage##0##San Marino U21 (6)##Czech Republic U21 (1)##QR##eu##UEF##0-0##-##1 HF##Europe 
(UEFA)##1599071400######2##8######san-marino-u21-vs-czech-republic-u21/02-09-2020##uefa##qualifying-group-stage##8##PHASE##2021-hungary-slovenia##round-1##1##u21-championship##1##1599071640##29.00##1.01##21.00########0##0 ^^##a##1599071731##1599071658^^~~##5333322-1##197760##0-1##Set 2##2##40089##32214##Pavic/Soares B.##Granollers-P M./Zeballos H. (5)##US OPEN##us##ATP##3-6|2-2|-|-|-##ATP Doubles##US Open##1599068700####H##atp-doubles##us-open##2020######195167######0####13##R32##3########################Set2##-2##z##1##- 5333324-1##197760##0-1##Set 2##2##41811##51180##Bambridge L./McLachlan B.##Eubanks C./Mcdonald M. (wc)##US OPEN##us##ATP##3-6|2-5|-|-|-##ATP Doubles##US Open##1599067500####A##atp-doubles##us-open##2020######195167######0####16##R32##3########################Set2##-2##z##0##- 5333325-1##197760##1-1##Set 3##2##27573##39167##Chardy J./Martin F.##Harrison C./Harrison R. (wc)##US OPEN##us##ATP##7-5|65-77|0-0|-|-##ATP Doubles##US Open##1599066000####H##atp-doubles##us-open##2020######195167######0####25##R32##3########################Set3##-2##z##1##30-30 5333330-1##197760##1-0##Set 2##2##143786##21596##Gille S./Vliegen J.##Kubot L./Melo M. 
(2)##US OPEN##us##ATP##6-2|4-3|-|-|-##ATP Doubles##US Open##1599067500####H##atp-doubles##us-open##2020######195167######0####15##R32##3########################Set2##-2##z##1##30-15 5334089-1##197805##1-1##Set 3##2##145891##145017##Carlos Alcaraz (se)##Juan Pablo Ficovich####it##CHM##4-6|6-3|5-4|-|-##Challenger Men Singles##Cordenons (Italy)##1599063000####H##challenger-men-singles##cordenons##2020##es##ar##195223##ESP##ARG######28##R32##5########################Set3##10##z##1##- 5334294-1##197761##0-1##Set 2##2##46027##43093##Gerasimov E.##Thompson J.##US OPEN##us##ATP##1-6|3-5|-|-|-##ATP Singles##US Open##1599067500####H##atp-singles##us-open##2020##by##au##195166##BLR##AUS##0####15##R64##1######gerasimov-e##thompson-j################Set2##-2##z##1##40-30 5334313-1##197761##0-0##Set 1##2##58519##52671##Davidovich Fokina A.##Hurkacz H. (24)##US OPEN##us##ATP##0-0|-|-|-|-##ATP Singles##US Open##1599071700####A##atp-singles##us-open##2020##es##pl##195166##ESP##POL##0####0##R64##1######davidovich-fokina-a##hurkacz-h################Set1##-2##z##1##40-15 5334316-1##197761##0-0##Set 1##2##21374##41813##Djokovic N. (1)##Edmund K.##US OPEN##us##ATP##1-2|-|-|-|-##ATP Singles##US Open##1599070500####H##atp-singles##us-open##2020##rs##gb-eng##195166##SRB##ENG##0####3##R64##1######novak-djokovic##edmund-k################Set1##-2##z##1##- 5334317-1##197761##0-1##Set 2##2##143584##44241##Nakashima B. (wc)##Zverev A. (5)##US OPEN##us##ATP##5-7|4-3|-|-|-##ATP Singles##US Open##1599066600####A##atp-singles##us-open##2020##us##de##195166##USA##GER##0####19##R64##1######nakashima-b##alexander-zverev################Set2##-2##z##1##15-40 5334319-1##197761##0-0##Set 1##2##55929##38842##Harris Ll.##Goffin D. (7)##US OPEN##us##ATP##4-3|-|-|-|-##ATP Singles##US Open##1599069300####A##atp-singles##us-open##2020##za##be##195166##RSA##BEL##0####7##R64##1######harris-ll##david-goffin################Set1##-2##z##1##30-15 5334322-1##197761##0-0##Set 1##2##32375##38073##Mannarino A. 
(32)##Sock J. (pr)##US OPEN##us##ATP##1-2|-|-|-|-##ATP Singles##US Open##1599070800####H##atp-singles##us-open##2020##fr##us##195166##FRA##USA##0####3##R64##1######adrian-mannarino##jack-sock################Set1##-2##z##1##30-15 5334325-1##197761##2-0##Set 3##2##31424##42596##Kukushkin M.##Garin C. (13)##US OPEN##us##ATP##6-2|6-1|2-5|-|-##ATP Singles##US Open##1599065100####H##atp-singles##us-open##2020##kz##cl##195166##KAZ##CHI##0####22##R64##1######mikhail-kukushkin##garin-c################Set3##-2##z##1##40-15 5334328-1##197761##0-0##Set 1##2##51475##43105##Mmoh M. (wc)##Struff J-L. (28)##US OPEN##us##ATP##2-5|-|-|-|-##ATP Singles##US Open##1599070200####A##atp-singles##us-open##2020##us##de##195166##USA##GER##0####7##R64##1######mmoh-m##jan-lennard-struff################Set1##-2##z##1##- 5334329-1##197763##0-1##Set 2##2##4337##41349##Flipkens K.##Pegula J. (wc)##US##us##WTA##61-77|0-0|-|-|-##WTA Singles##US Open##1599068100####H##wta-singles##us-open##2020##be##us##195168##BEL##USA##0####13##R64##2######kirsten-flipkens##pegula-j################Set2##10##z##1##15-0 ^^##a##1599071731##0^^~~##5320202-1##197316##68-62##Q4##2##47955##47954##TBV Start Lublin##Polski Cukier Torun##PLK-RS##pl##POL##21-16|15-15|18-21|14-10| - |36-31##Poland##Energa Basket Liga##1599066000######poland##energa-basket-liga##2020-2021######197315######1####1.21##4.25##########4Qrt##1##z##0 ^^##a##1599071661##0^^~~##5333286-3##197776##1-0##Set 2##2##8410##8414##Spor Toto (1)##Ziraat Bankasi (2)##GS##tr##TUR##25-18|8-5|-|-|-##Turkey##Turkish Cup - Group Stage##1599069600######turkey##national-cup##2020-2021######197447######1####56##############2S##3##z##0 ^^##a##1599071674##1599071126^^~~##5302817-3##196725##28-21##2H##2##140460##41158##Molde W##Larvik W##RS##no##NOR##13-9##Norway##REMA 1000-ligaen - Women##1599066900######norway##postenligaen-women##2020-2021######196717######1####49##1.12##7.50##12.00########2H##2##z##0 
5303101-3##196762##21-13##2H##2##8559##3172##Sonderjyske##Skjern##RS##dk##DEN##16-10##Denmark##Handbold Liagen##1599067800######denmark##handball-league##2020-2021######196756######1####34##3.40##1.55##8.50########2H##1##z##0 5303102-3##196762##1-0##1H##2##3517##3516##Skanderborg##Arhus GF##RS##dk##DEN##1-0##Denmark##Handbold Liagen##1599071400######denmark##handball-league##2020-2021######196756######1##1H##1##1.35##4.50##9.50########1H##1##z##0 5304740-3##196776##25-16##2H##2##3587##6865##Kadetten Schaffhausen##Amicitia Zurich##RS##ch##SUI##10-9##Switzerland##NLA##1599066000######switzerland##nla##2020-2021######196774######1####41##1.11##8.00##12.00########2H##1##z##0 5304741-3##196776##12-11##2H##2##10782##10780##HC Kriens##Wacker Thun##RS##ch##SUI##11-10##Switzerland##NLA##1599067800######switzerland##nla##2020-2021######196774######1##H##23##1.50##3.20##8.00########2H##1##z##0 5304742-3##196776##22-18##2H##2##10786##10783##Pfadi Winterthur##Bern Muri##RS##ch##SUI##19-13##Switzerland##NLA##1599067800######switzerland##nla##2020-2021######196774######1##A##40##1.25##5.00##10.00########2H##1##z##0 5304743-3##196776##15-15##2H##2##10777##10784##St. Otmar St. 
Gallen##Suhr Aarau##RS##ch##SUI##14-15##Switzerland##NLA##1599067800######switzerland##nla##2020-2021######196774######1####30##2.00##2.15##7.50########2H##1##z##0 5304744-3##196776##7-7##1H##2##12340##10778##Endingen##1879 Basel##RS##ch##SUI##7-7##Switzerland##NLA##1599069600######switzerland##nla##2020-2021######196774######1####14##1.67##2.60##7.50########1H##1##z##0 5312581-3##197132##11-17##HT##2##41099##3282##Oroshazi##Pick Szeged##RS##hu##HUN##11-17##Hungary##Liga 1##1599068700######hungary##liga-1##2020-2021######197130######1####28##67.00##1.00##50.00########H/T##1##z##0 5334268-3##197814##2-3##1H##2##9591##9590##Fivers WAT Margareten##Alpla Hard##CUP##at##AUT##2-3##Austria##Super Cup - Cup##1599070800######austria##super-cup##2020-2021######197234######0##H##5##1.67##2.60##7.50########1H##5##z##0 ^^##a##1599071731##1599071728^^~~##5321546-1##197388##2-2##P3##2##22308##22300##HC CSKA Moscow##AK Bars Kazan##KHL-RS##ru##RUS##1-0|0-1|1-1| - | - ##Russia##KHL##1599064200######russia##khl##2020-2021######197387######1####hc-cska-moscow-vs-ak-bars-kazan/02-09-2020##1.67##2.25##########3Per##1##z##1 ^^##a##1599071289##0^^~~##^^##a##1597635989##0^^~~##^^##a##1599057728##0^^~~##^^##a##0##0^^~~##^^##a##1599042879##1590074504^^~~##^^##a##1599045851##0^^~~##^^##a##1599071265##0^^~~##45
I am trying to scrape Craigslist using BeautifulSoup4. All data shows properly EXCEPT price. I can't seem to find the right tagging to loop through pricing instead of showing the same price for each post.
import requests
from bs4 import BeautifulSoup
# Question code: print title, price and URL for every search result.
source = requests.get('https://washingtondc.craigslist.org/search/nva/sss?query=5%20hp%20boat%20motor&sort=rel').text
soup = BeautifulSoup(source, 'lxml')
for summary in soup.find_all('p', class_='result-info'):
    # NOTE(review): find() is called on soup (the whole document), not on
    # summary, so the same first match is returned on every iteration.
    pricing = soup.find('span', class_='result-price')
    price = pricing
    title = summary.a.text
    url = summary.a['href']
    print(title + '\n' + price.text + '\n' + url + '\n')
Left: HTML code from Craigslist, commented out is irrelevant (in my opinion) code. I want pricing to not loop the same number. Right: Sublime SS of code.
Snippet of code running through terminal. Pricing is the same for each post.
Thank you
Your script is almost correct. You need to look up the price on `summary` instead of on `soup`, so that the search is limited to the current listing rather than the whole page:
import requests
from bs4 import BeautifulSoup
# Print title, price and URL for every search result on the page.
source = requests.get('https://washingtondc.craigslist.org/search/nva/sss?query=5%20hp%20boat%20motor&sort=rel').text
soup = BeautifulSoup(source, 'lxml')
for summary in soup.find_all('p', class_='result-info'):
    # Search inside the current listing only, so each post gets its own price.
    price = summary.find('span', class_='result-price')
    # find() returns None when a listing has no price span; print an empty
    # price line for it instead of failing with AttributeError on `.text`.
    price_text = price.text if price is not None else ''
    title = summary.a.text
    url = summary.a['href']
    print(title + '\n' + price_text + '\n' + url + '\n')
Output:
Boat Water Tender - 10 Tri-Hull with Electric Trolling Motor
$629
https://washingtondc.craigslist.org/nva/boa/d/haymarket-boat-water-tender-10-tri-hull/7160572264.html
1987 Boston Whaler Montauk 17
$25450
https://washingtondc.craigslist.org/nva/boa/d/alexandria-1987-boston-whaler-montauk-17/7163033134.html
1971 Westerly Warwick Sailboat
$3900
https://washingtondc.craigslist.org/mld/boa/d/upper-marlboro-1971-westerly-warwick/7170495800.html
Buy or Rent. DC Party Pontoon for Dock Parties or Cruises
$15000
https://washingtondc.craigslist.org/doc/boa/d/washington-buy-or-rent-dc-party-pontoon/7157810378.html
West Marine Zodiac Inflatable Boat SB285 With 5HP Gamefisher (Merc)
$850
https://annapolis.craigslist.org/boa/d/annapolis-west-marine-zodiac-inflatable/7166031908.html
2012 AB aluminum/hypalon inflatable dinghy/2012 Yamaha 6hp four stroke
$3400
https://annapolis.craigslist.org/bpo/d/annapolis-2012-ab-aluminum-hypalon/7157768911.html
RHODES-18’ CENTERBOARD DAYSAILER
$6500
https://annapolis.craigslist.org/boa/d/ocean-view-rhodes-18-centerboard/7148322078.html
Mercury Outboard 7.5 HP
$250
https://baltimore.craigslist.org/bpo/d/middle-river-mercury-outboard-75-hp/7167399866.html
8 hp yamaha 2 stroke
$0
https://baltimore.craigslist.org/bpo/d/8-hp-yamaha-2-stroke/7154103281.html
TRADE 38' BENETEAU IDYLLE 1150
$35000
https://baltimore.craigslist.org/boa/d/middle-river-trade-38-beneteau-idylle/7163761741.html
5-hp Top Tank Mercury
$0
https://baltimore.craigslist.org/bpo/d/5-hp-top-tank-mercury/7154102434.html
5-hp Top Tank Mercury
$0
https://baltimore.craigslist.org/bpo/d/5-hp-top-tank-mercury/7154102744.html
Wanted ur unwanted outboards
$0
https://baltimore.craigslist.org/bpo/d/randallstown-wanted-ur-unwanted/7141349142.html
Grumman Sport Boat
$2250
https://baltimore.craigslist.org/boa/d/baldwin-grumman-sport-boat/7157186381.html
1996 Carver 355 Aft Cabin Motor Yacht
$47000
https://baltimore.craigslist.org/boa/d/middle-river-1996-carver-355-aft-cabin/7156830617.html
Lower unit, long shaft
$50
https://baltimore.craigslist.org/bpo/d/catonsville-lower-unit-long-shaft/7155566763.html
Lower unit, long shaft
$50
https://baltimore.craigslist.org/bpo/d/catonsville-lower-unit-long-shaft/7155565771.html
Lower unit, long shaft
$50
https://baltimore.craigslist.org/bpo/d/catonsville-lower-unit-long-shaft/7155566035.html
Lower unit, long shaft
$50
https://baltimore.craigslist.org/bpo/d/catonsville-lower-unit-long-shaft/7155565301.html
Cape Dory 25 Sailboat for sale or trade
$6500
https://baltimore.craigslist.org/boa/d/reedville-cape-dory-25-sailboat-for/7149227778.html
West Marine HP-V 350
$1200
https://baltimore.craigslist.org/boa/d/pasadena-west-marine-hp-350/7147285666.html
I am trying to create a webscraper for a website. The problem is that after the collected data is stored in a list, I'm not able to write this to a csv file properly. I have been stuck for ages with this problem and hopefully someone has an idea about how to fix this one!
The loop to get the data from the web pages:
import csv
from htmlrequest import simple_get
from htmlrequest import BeautifulSoup
# Define variables
# listData holds three strings that start out as column headings; the loops
# below keep appending scraped text to the first two of them.
listData = ['Companies', 'Locations', 'Descriptions']
plus = 15
# NOTE(review): this name shadows the builtin max() for the rest of the script.
max = 30
count = 0
# while loop to repeat process till max is reached
while (count <= max):
    # count is sent as the `start` query parameter and advanced by `plus` each pass.
    start = 'https://www.companiesintheuk.co.uk/find?q=Activities+of+sport+clubs&start=' + str(count) + '&s=h&t=SicCodeSearch&location=&sicCode=93120'
    raw_html = simple_get(start)
    soup = BeautifulSoup(raw_html, 'html.parser')
    # Each match is concatenated onto one growing string, so individual
    # company names are not kept as separate items.
    for i, div in enumerate(soup.find_all('div', class_="search_result_title")):
        listData[0] = listData[0].strip() + div.text
    for i, div2 in enumerate(soup.find_all('div', class_="searchAddress")):
        listData[1] = listData[1].strip() + div2.text
    # This is extra information
    # for i, div3 in enumerate(soup.find_all('div', class_="searchSicCode")):
    # listData[2] = listData[2].strip() + div3.text
    count = count + plus
output example if printed:
Companies
(AMG) AGILITY MANAGEMENT GROUP LTD
(KLA) LIONS/LIONESS FOOTBALL TEAMS WORLD CUP LTD
(Dissolved)
1 SPORT ORGANISATION LIMITED
100UK LTD
1066 GYMNASTICS
1066 SPECIALS
10COACHING LIMITED
147 LOUNGE LTD
147 SNOOKER AND POOL CLUB (LEICESTER) LIMITED
Locations
ENGLAND, BH8 9PS
LONDON, EC2M 2PL
ENGLAND, LS7 3JB
ENGLAND, LE2 8FN
UNITED KINGDOM, N18 2QX
AVON, BS5 0JH
UNITED KINGDOM, WC2H 9JQ
UNITED KINGDOM, SE18 5SZ
UNITED KINGDOM, EC1V 2NX
I've tried to get it into a CSV file by using this code but I can't figure out how to properly format my output! Any suggestions are welcome.
# writing to csv
with open('test.csv', 'w') as csvfile:
    write = csv.writer(csvfile, delimiter=',')
    write.writerow(['Name','Location'])
    # NOTE(review): a single data row is written; each cell is the whole
    # concatenated string accumulated in listData, not one company per row.
    write.writerow([listData[0],listData[1]])
print("Writing has been done!")
I want the code to be able to format it properly in the csv file to be able to import the two rows in a database.
This is the output when I write the data on 'test.csv'
which will result into this when opened up
The expected outcome would be something like this!
I'm not sure in what way it is improperly formatted, but maybe you just need to replace `with open('test.csv', 'w')` with `with open('test.csv', 'w+', newline='')`.
I've combined your code, swapping htmlrequest for the requests and bs4 modules and collecting the results in my own lists instead of listData. (I've left your lists in, but they do nothing.)
import csv
import bs4
import requests
# Scrape company names and locations page by page, then write them to
# test.csv with one company per row.

# Define variables
# listData is retained from the question; it is not used for the CSV output.
listData = ['Companies', 'Locations', 'Descriptions']
company_list = []
locations_list = []
plus = 15
max_start = 30  # last `start` offset to request (named so it does not shadow builtin max)
count = 0
# while loop to repeat process till max_start is reached
while count <= max_start:
    start = 'https://www.companiesintheuk.co.uk/find?q=Activities+of+sport+clubs&start={}&s=h&t=SicCodeSearch&location=&sicCode=93120'.format(count)
    res = requests.get(start)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    for div in soup.find_all('div', class_="search_result_title"):
        listData[0] = listData[0].strip() + div.text
        company_list.append(div.text.strip())
    for div2 in soup.find_all('div', class_="searchAddress"):
        listData[1] = listData[1].strip() + div2.text
        locations_list.append(div2.text.strip())
    # This is extra information
    # for i, div3 in enumerate(soup.find_all('div', class_="searchSicCode")):
    # listData[2] = listData[2].strip() + div3.text
    count = count + plus

# Names and locations are paired by position, so only write the file when
# both lists have the same length.
if len(company_list) == len(locations_list):
    # newline='' lets the csv module control line endings itself.
    with open('test.csv', 'w+', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['Name', 'Location'])
        writer.writerows(zip(company_list, locations_list))
else:
    # Previously this case produced no file and no message at all.
    print('Scraped {} company names but {} locations; test.csv was not written.'.format(
        len(company_list), len(locations_list)))
Which generates a csv file like:
Name,Location
(AMG) AGILITY MANAGEMENT GROUP LTD,"UNITED KINGDOM, M6 6DE"
"(KLA) LIONS/LIONESS FOOTBALL TEAMS WORLD CUP LTD
(Dissolved)","ENGLAND, BD1 2PX"
0161 STUDIOS LTD,"UNITED KINGDOM, HD6 3AX"
1 CLICK SPORTS MANAGEMENT LIMITED,"ENGLAND, E10 5PW"
1 SPORT ORGANISATION LIMITED,"UNITED KINGDOM, CR2 6NF"
100UK LTD,"UNITED KINGDOM, BN14 9EJ"
1066 GYMNASTICS,"EAST SUSSEX, BN21 4PT"
1066 SPECIALS,"EAST SUSSEX, TN40 1HE"
10COACHING LIMITED,"UNITED KINGDOM, SW6 6LR"
10IS ACADEMY LIMITED,"ENGLAND, PE15 9PS"
"10TH MAN LIMITED
(Dissolved)","GLASGOW, G3 6AN"
12 GAUGE EAST MANCHESTER COMMUNITY MMA LTD,"ENGLAND, OL9 8DQ"
121 MAKING WAVES LIMITED,"TYNE AND WEAR, NE30 1AR"
121 WAVES LTD,"TYNE AND WEAR, NE30 1AR"
1-2-KICK LTD,"ENGLAND, BH8 9PS"
"147 HAVANA LIMITED
(Liquidation)","LONDON, EC2M 2PL"
147 LOUNGE LTD,"ENGLAND, LS7 3JB"
147 SNOOKER AND POOL CLUB (LEICESTER) LIMITED,"ENGLAND, LE2 8FN"
1ACTIVE LTD,"UNITED KINGDOM, N18 2QX"
1ON1 KING LTD,"AVON, BS5 0JH"
1PUTT LTD,"UNITED KINGDOM, WC2H 9JQ"
1ST SPORTS LTD,"UNITED KINGDOM, SE18 5SZ"
2 BRO PRO EVENTS LTD,"UNITED KINGDOM, EC1V 2NX"
2 SPLASH SWIM SCHOOL LTD,"ENGLAND, B36 0EY"
2 STEPPERS C.I.C.,"SURREY, CR0 6BX"
2017 MOTO LIMITED,"UNITED KINGDOM, ME2 4NW"
2020 ARCHERY LTD,"LONDON, SE16 6SS"
21 LEISURE LIMITED,"LONDON, EC4M 7WS"
261 FEARLESS CLUB UNITED KINGDOM CIC,"LANCASHIRE, LA2 8RF"
2AIM4 LIMITED,"HERTFORDSHIRE, SG2 0JD"
2POINT4 FM LTD,"LONDON, NW10 8LW"
3 LIONS SCHOOL OF SPORT LTD,"BRISTOL, BS20 8BU"
3 PT LTD,"ANTRIM, BT40 2FB"
3 PUTT LIFE LTD,"UNITED KINGDOM, LU3 2DP"
3 THIRTY SEVEN LTD,"KENT, DA9 9RS"
3:30 SOCCER SCHOOL LTD,"UNITED KINGDOM, EH6 7JB"
30 MINUTE WORKOUT (LLANISHEN) LTD,"PONTYCLUN, CF72 9UA"
321 RELAX LTD,"MID GLAMORGAN, CF83 3HL"
360 MOTOR RACING CLUB LTD,"HALSTEAD, CO9 2ET"
3LIONSATHLETICS LIMITED,"ENGLAND, S3 8DB"
3S SWIM ROMFORD LTD,"UNITED KINGDOM, DA9 9DR"
3XL EVENT MANAGEMENT LIMITED,"KENT, BR3 4NW"
3XL MOTORSPORT MANAGEMENT LIMITED,"KENT, BR3 4NW"
4 CORNER FOOTBALL LTD,"BROMLEY, BR1 5DD"
4 PRO LTD,"UNITED KINGDOM, FY5 5HT"
That seems fine to me, but your post was very unclear about how you expected it to be formatted, so I really have no idea whether this is what you wanted.
URL: http://www.imdb.com/chart/?ref_=nv_ch_cht_2
I want you to print the top box office list from the site above (every movie's rank, title, weekend, gross and weeks, in order).
Example output:
Rank:1
title: godzilla
weekend:$93.2M
Gross:$93.2M
Weeks: 1
Rank: 2
title: Neighbours
This is just a simple way to extract those entities by BeautifulSoup
# Print the text of the title, rating and weeks cells of every chart row.
# Ported to Python 3: urllib2 and the print statement exist only in Python 2.
from urllib.request import urlopen

from bs4 import BeautifulSoup

url = "http://www.imdb.com/chart/?ref_=nv_ch_cht_2"
html = urlopen(url).read()
page = BeautifulSoup(html, 'html.parser')
# Rows whose class attribute contains either "odd" or "even".
rows = page.find_all("tr", {'class': ['odd', 'even']})
for tr in rows:
    # The loop variable no longer reuses the name that held the downloaded page.
    for cell in tr.find_all("td", {'class': ['titleColumn', 'weeksColumn', 'ratingColumn']}):
        print(cell.get_text())
P.S. Arrange the output however you like.
There is no need to scrape anything. See the answer I gave here.
How to scrape data from imdb business page?
The Python script below gives you 1) the list of top box office movies from IMDb and 2) the cast list for each of them.
from lxml.html import parse
def imdb_bo(no_of_movies=5):
    """Scrape the IMDb chart page and each listed movie's cast table.

    Args:
        no_of_movies: Maximum number of chart entries to return. It is capped
            at the number of children found in the third child of the first
            ``table.chart`` element.

    Returns:
        A dict mapping the 0-based chart position to a dict with the keys
        ``url``, ``title``, ``year``, ``weekend``, ``gross``, ``weeks`` and
        ``cast`` (a list of names).
    """
    bo_url = 'http://www.imdb.com/chart/'
    bo_page = parse(bo_url).getroot()
    bo_table = bo_page.cssselect('table.chart')
    bo_total = len(bo_table[0][2])
    count = min(no_of_movies, bo_total)

    # Run each whole-document CSS query once instead of once per movie.
    title_cells = bo_page.cssselect('td.titleColumn')
    rating_cells = bo_page.cssselect('td.ratingColumn')
    weeks_cells = bo_page.cssselect('td.weeksColumn')

    movies = {}
    for i in range(count):
        title_cell = title_cells[i]
        mo = {
            'url': 'http://www.imdb.com' + title_cell[0].get('href'),
            'title': title_cell[0].text_content().strip(),
            'year': title_cell[1].text_content().strip(" ()"),
            # Each movie has two ratingColumn cells: weekend first, then gross.
            'weekend': rating_cells[i * 2].text_content().strip(),
            'gross': rating_cells[(i * 2) + 1][0].text_content().strip(),
            'weeks': weeks_cells[i].text_content().strip(),
        }

        # One extra request per movie to read its cast table.
        m_page = parse(mo['url']).getroot()
        m_casttable = m_page.cssselect('table.cast_list')
        # The first row of the cast table is skipped (presumably a header row).
        mo['cast'] = [
            row[1][0][0].text_content().strip()
            for row in m_casttable[0][1:]
        ]
        movies[i] = mo
    return movies
if __name__ == '__main__':
    # Ported to Python 3: raw_input(), dict.iteritems() and the print
    # statement exist only in Python 2.
    no_of_movies = input("Enter no. of Box office movies to display:")
    bo_movies = imdb_bo(int(no_of_movies))
    # Keys are 0-based chart positions; sort so the output is always in rank order.
    for k, v in sorted(bo_movies.items()):
        print('#' + str(k + 1) + ' ' + v['title'] + ' (' + v['year'] + ')')
        print('URL: ' + v['url'])
        print('Weekend: ' + v['weekend'])
        print('Gross: ' + v['gross'])
        print('Weeks: ' + v['weeks'])
        print('Cast: ' + ', '.join(v['cast']))
        print('\n')
Output (run in terminal):
parag#parag-innovate:~/python$ python imdb_bo_scraper.py
Enter no. of Box office movies to display:3
#1 Cinderella (2015)
URL: http://www.imdb.com/title/tt1661199?ref_=cht_bo_1
Weekend: $67.88M
Gross: $67.88M
Weeks: 1
Cast: Cate Blanchett, Lily James, Richard Madden, Helena Bonham Carter, Nonso Anozie, Stellan Skarsgård, Sophie McShera, Holliday Grainger, Derek Jacobi, Ben Chaplin, Hayley Atwell, Rob Brydon, Jana Perez, Alex Macqueen, Tom Edden
#2 Run All Night (2015)
URL: http://www.imdb.com/title/tt2199571?ref_=cht_bo_2
Weekend: $11.01M
Gross: $11.01M
Weeks: 1
Cast: Liam Neeson, Ed Harris, Joel Kinnaman, Boyd Holbrook, Bruce McGill, Genesis Rodriguez, Vincent D'Onofrio, Lois Smith, Common, Beau Knapp, Patricia Kalember, Daniel Stewart Sherman, James Martinez, Radivoje Bukvic, Tony Naumovski
#3 Kingsman: The Secret Service (2014)
URL: http://www.imdb.com/title/tt2802144?ref_=cht_bo_3
Weekend: $6.21M
Gross: $107.39M
Weeks: 5
Cast: Adrian Quinton, Colin Firth, Mark Strong, Jonno Davies, Jack Davenport, Alex Nikolov, Samantha Womack, Mark Hamill, Velibor Topic, Sofia Boutella, Samuel L. Jackson, Michael Caine, Taron Egerton, Geoff Bell, Jordan Long