Python Forum
Webscraping news articles by using selenium
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Webscraping news articles by using selenium
#1
Hello, I have to do webscraping of some articles from a website (pressreader).

My code is the following:

from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.chrome.service import Service



import pyautogui
import os.path

import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



def starttoend(start, end, year, month, day):
    s_year = start[0:4]
    s_mon = start[4:6]
    s_day = start[6:8]
    e_year = end[0:4]
    e_mon = end[4:6]
    e_day = end[6:8]
    ret = []
    for i in range(year.index(s_year), year.index(e_year) + 1):
        for j in range(month.index(s_mon), month.index(e_mon) + 1):
            if i == year.index(s_year) and j == month.index(s_mon):
                for k in range(day.index(s_day), 31):
                    ret.append(year[i] + month[j] + day[k])
            elif i == year.index(e_year) and j == month.index(e_mon):
                for k in range(0, day.index(e_day) + 1):
                    ret.append(year[i] + month[j] + day[k])
            else:
                for k in range(31):
                    ret.append(year[i] + month[j] + day[k])
    return ret


# name of papers to find
papernames = ["libero"]
start = "20080101"
end = "20230821"
cont_fail: int = 0
dates = []

year = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021",
        "2022", "2023"]
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18",
        "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]

date_tul = starttoend(start, end, year, months, days)

dates.append(date_tul)

index = list(range(25))

# set up to save print as PDF file
settings = {
    "appState": {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local"
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2
    }
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}

service = Service(executable_path=r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#service_obj = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#service = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#driver = webdriver.Chrome(service = service)


# change chrome printing option to minimize work.
chrome_options: Options = webdriver.ChromeOptions()
#chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
driver = webdriver.Chrome(service = service,        
                          options=chrome_options)

# traverse through all papers
for i in range(len(papernames)):
    # traverse through dates
    for j in dates[i]:
        count = 1
        dobreak = False
        for k in index:
            if (dobreak):
                break

            try:

                driver.get("https://www.pressreader.com/ita/" + papernames[i] + "/" + j + "/page/1/textview")
                actions1 = webdriver.common.action_chains.ActionChains(driver)
                actions2 = webdriver.common.action_chains.ActionChains(driver)

                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))

                bottom_button = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottom_0"]/a')

                bottom_button.click()

                time.sleep(2)

                all_bottom = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottomPreview_0"]')
                all_news = all_bottom.find_elements_by_xpath('//a[@page-number="1"]')

                news = all_news[k]
                first = True

                article_id = news.get_attribute("article-id")
                print(article_id)
                actions1.move_to_element(news).perform()
                news.click()

                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.XPATH, '//article[@aid="' + str(article_id) + '"]')))
                time.sleep(2)
                arti = driver.find_element_by_xpath('//article[@aid="' + str(article_id) + '"]')
                head = arti.find_element_by_tag_name("hgroup")
                time.sleep(1)
                actions2.move_to_element(head).perform()
                time.sleep(1)
                actions2.context_click(head).perform()

                time.sleep(2)
                printbutton = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[7]/a')
                printbutton.click()

                time.sleep(1)

                printtext = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[1]/a')
                printtext.click()

                time.sleep(4)
                name = ""
                if (count < 10):
                    name = papernames[i] + "_" + j + "_" + "0" + str(count)
                    pyautogui.typewrite(papernames[i] + "_" + j + "_" + "0" + str(count))
                else:
                    name = papernames[i] + "_" + j + "_" + str(count)
                    pyautogui.typewrite(papernames[i] + "_" + j + "_" + str(count))

                time.sleep(1)
                pyautogui.press('enter')
                print("saved" + name)

                time.sleep(10)

                count += 1
                cont_fail = 0
                if k == len(all_news) - 1:
                    driver.quit()
                    dobreak = True
                    break
                driver.quit()
                time.sleep(1)


            except:

                cont_fail += 1

                print("failed on" + papernames[i] + j + str(k))

                driver.quit()

            if cont_fail > 5:
                break

            continue
I keep getting this error:
C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 96, in <module>
    driver = webdriver.Chrome(service = service,        
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
    super().__init__(
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
    self.service.path = DriverFinder.get_path(self.service, options)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 44, in get_path
    raise NoSuchDriverException(f"Unable to locate or obtain driver for {options.capabilities['browserName']}")
selenium.common.exceptions.NoSuchDriverException: Message: Unable to locate or obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


Process finished with exit code 1
I have installed the chrome driver path in the system, already tried the old version of the selenium package (since I think that the original code (link: https://github.com/asui1/Webautomation/b...%20test.py) might be using an older version of selenium) but it still doesn't work.
I am new to python and I still have a lot to learn, can someone help me? Thank you in advance
Reply


Messages In This Thread
Webscraping news articles by using selenium - by cate16 - Aug-23-2023, 09:21 AM

Possibly Related Threads…
Thread Author Replies Views Last Post
  Webscraping with beautifulsoup cormanstan 3 2,317 Aug-24-2023, 11:57 AM
Last Post: snippsat
  Webscraping returning empty table Buuuwq 0 1,516 Dec-09-2022, 10:41 AM
Last Post: Buuuwq
  WebScraping using Selenium library Korgik 0 1,137 Dec-09-2022, 09:51 AM
Last Post: Korgik
  How to get rid of numerical tokens in output (webscraping issue)? jps2020 0 2,019 Oct-26-2020, 05:37 PM
Last Post: jps2020
  Python Webscraping with a Login Website warriordazza 0 2,717 Jun-07-2020, 07:04 AM
Last Post: warriordazza
  Help with basic webscraping Captain_Snuggle 2 4,081 Nov-07-2019, 08:07 PM
Last Post: kozaizsvemira
  Can't Resolve Webscraping AttributeError Hass 1 2,395 Jan-15-2019, 09:36 PM
Last Post: nilamo
  How to exclude certain links while webscraping basis on keywords Prince_Bhatia 0 3,328 Oct-31-2018, 07:00 AM
Last Post: Prince_Bhatia
  Webscraping homework Ghigo1995 1 2,730 Sep-23-2018, 07:36 PM
Last Post: nilamo
  Intro to WebScraping d1rjr03 2 3,544 Aug-15-2018, 12:05 AM
Last Post: metulburr

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020