Webscraping news articles by using selenium

cate16 · Aug-23-2023, 09:21 AM

Hello, I have to do webscraping of some articles from a website (pressreader).

My code is the following:

from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.chrome.service import Service



import pyautogui
import os.path

import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



def starttoend(start, end, year, month, day):
    s_year = start[0:4]
    s_mon = start[4:6]
    s_day = start[6:8]
    e_year = end[0:4]
    e_mon = end[4:6]
    e_day = end[6:8]
    ret = []
    for i in range(year.index(s_year), year.index(e_year) + 1):
        for j in range(month.index(s_mon), month.index(e_mon) + 1):
            if i == year.index(s_year) and j == month.index(s_mon):
                for k in range(day.index(s_day), 31):
                    ret.append(year[i] + month[j] + day[k])
            elif i == year.index(e_year) and j == month.index(e_mon):
                for k in range(0, day.index(e_day) + 1):
                    ret.append(year[i] + month[j] + day[k])
            else:
                for k in range(31):
                    ret.append(year[i] + month[j] + day[k])
    return ret


# name of papers to find
papernames = ["libero"]
start = "20080101"
end = "20230821"
cont_fail: int = 0
dates = []

year = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021",
        "2022", "2023"]
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18",
        "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]

date_tul = starttoend(start, end, year, months, days)

dates.append(date_tul)

index = list(range(25))

# set up to save print as PDF file
settings = {
    "appState": {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local"
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2
    }
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}

service = Service(executable_path=r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#service_obj = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#service = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#driver = webdriver.Chrome(service = service)


# change chrome printing option to minimize work.
chrome_options: Options = webdriver.ChromeOptions()
#chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
driver = webdriver.Chrome(service = service,        
                          options=chrome_options)

# traverse through all papers
for i in range(len(papernames)):
    # traverse through dates
    for j in dates[i]:
        count = 1
        dobreak = False
        for k in index:
            if (dobreak):
                break

            try:

                driver.get("https://www.pressreader.com/ita/" + papernames[i] + "/" + j + "/page/1/textview")
                actions1 = webdriver.common.action_chains.ActionChains(driver)
                actions2 = webdriver.common.action_chains.ActionChains(driver)

                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))

                bottom_button = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottom_0"]/a')

                bottom_button.click()

                time.sleep(2)

                all_bottom = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottomPreview_0"]')
                all_news = all_bottom.find_elements_by_xpath('//a[@page-number="1"]')

                news = all_news[k]
                first = True

                article_id = news.get_attribute("article-id")
                print(article_id)
                actions1.move_to_element(news).perform()
                news.click()

                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.XPATH, '//article[@aid="' + str(article_id) + '"]')))
                time.sleep(2)
                arti = driver.find_element_by_xpath('//article[@aid="' + str(article_id) + '"]')
                head = arti.find_element_by_tag_name("hgroup")
                time.sleep(1)
                actions2.move_to_element(head).perform()
                time.sleep(1)
                actions2.context_click(head).perform()

                time.sleep(2)
                printbutton = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[7]/a')
                printbutton.click()

                time.sleep(1)

                printtext = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[1]/a')
                printtext.click()

                time.sleep(4)
                name = ""
                if (count < 10):
                    name = papernames[i] + "_" + j + "_" + "0" + str(count)
                    pyautogui.typewrite(papernames[i] + "_" + j + "_" + "0" + str(count))
                else:
                    name = papernames[i] + "_" + j + "_" + str(count)
                    pyautogui.typewrite(papernames[i] + "_" + j + "_" + str(count))

                time.sleep(1)
                pyautogui.press('enter')
                print("saved" + name)

                time.sleep(10)

                count += 1
                cont_fail = 0
                if k == len(all_news) - 1:
                    driver.quit()
                    dobreak = True
                    break
                driver.quit()
                time.sleep(1)


            except:

                cont_fail += 1

                print("failed on" + papernames[i] + j + str(k))

                driver.quit()

            if cont_fail > 5:
                break

            continue

I keep getting this error:

C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 96, in <module>
    driver = webdriver.Chrome(service = service,        
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
    super().__init__(
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
    self.service.path = DriverFinder.get_path(self.service, options)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 44, in get_path
    raise NoSuchDriverException(f"Unable to locate or obtain driver for {options.capabilities['browserName']}")
selenium.common.exceptions.NoSuchDriverException: Message: Unable to locate or obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


Process finished with exit code 1

I have installed the chrome driver path in the system, already tried the old version of the selenium package (since I think that the original code (link: https://github.com/asui1/Webautomation/b...%20test.py) might be using an older version of selenium) but it still doesn't work.
I am new to python and I still have a lot to learn, can someone help me? Thank you in advance

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Webscraping with beautifulsoup	cormanstan	3	2,317	Aug-24-2023, 11:57 AM Last Post: snippsat
	Webscraping returning empty table	Buuuwq	0	1,516	Dec-09-2022, 10:41 AM Last Post: Buuuwq
	WebScraping using Selenium library	Korgik	0	1,137	Dec-09-2022, 09:51 AM Last Post: Korgik
	How to get rid of numerical tokens in output (webscraping issue)?	jps2020	0	2,019	Oct-26-2020, 05:37 PM Last Post: jps2020
	Python Webscraping with a Login Website	warriordazza	0	2,717	Jun-07-2020, 07:04 AM Last Post: warriordazza
	Help with basic webscraping	Captain_Snuggle	2	4,081	Nov-07-2019, 08:07 PM Last Post: kozaizsvemira
	Can't Resolve Webscraping AttributeError	Hass	1	2,395	Jan-15-2019, 09:36 PM Last Post: nilamo
	How to exclude certain links while webscraping basis on keywords	Prince_Bhatia	0	3,328	Oct-31-2018, 07:00 AM Last Post: Prince_Bhatia
	Webscraping homework	Ghigo1995	1	2,730	Sep-23-2018, 07:36 PM Last Post: nilamo
	Intro to WebScraping	d1rjr03	2	3,544	Aug-15-2018, 12:05 AM Last Post: metulburr

Webscraping news articles by using selenium

User Panel Messages

Announcements