(Mar-17-2019, 10:18 AM)metulburr Wrote: based on this
Quote:GenericErrorMessageCookies="Cookies must be enabled in order to view this page.";
Based on this, I would suggest that your program is not using cookies.
(Mar-17-2019, 08:16 AM)yeungcase Wrote: I did check my browser setting and cookies are enabled. Your browser settings have nothing to do with requests in Python, since the requests module is what is sending the request data. Are you sending the cookie via the requests module?
Show us your code.
"""Scrape Hong Kong Jockey Club race entries and results for every race day
listed on the site that is not yet saved locally, and write each missing day
to its own Excel workbook (one sheet per race page).

Reconstructed from a whitespace-mangled paste; logic preserved except for the
bug fixes marked inline.
"""

from urllib.request import urlopen  # NOTE(review): unused in this script; kept to avoid breaking other code
from bs4 import BeautifulSoup
import requests
import pandas as pd
import xlsxwriter  # used implicitly as the ExcelWriter engine
import re
import os

ENTRIES_BASE = 'http://racing.hkjc.com/racing/info/meeting/Entries/English/Local/'
RESULTS_BASE = 'http://racing.hkjc.com/racing/info/meeting/Results/English/Local/'
LOCAL_FOLDER = 'C://AnyDirectory'
OUTPUT_FOLDER = 'C:\\AnyDirectory\\'


def _get_soup(url):
    """Fetch *url* and return a BeautifulSoup of its UTF-8-decoded body."""
    response = requests.get(url)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')


def _scrape_race_days():
    """Return the race days ('dd/mm/yyyy') listed in the site's day selector."""
    soup = _get_soup('https://racing.hkjc.com/racing/info/meeting/Results/English/Local/')
    selector = soup.find('div', class_="rowDiv5").find('td', class_="tdAlignR")
    options = selector.find_all("option", {'value': re.compile('^Local')})
    # The first option is a placeholder, not a race day — skip it.
    return [option.text for option in options[1:]]


def _local_race_days(folder):
    """Return race days already saved in *folder*, as 'dd/mm/yyyy' strings.

    Filenames are assumed to start with a 'yyyy-mm-dd' prefix — TODO confirm.
    """
    days = []
    for filename in os.listdir(folder):
        # BUG FIX: the original called os.path.splitext() and discarded the
        # result; use the extension-free stem before slicing the date prefix.
        stem = os.path.splitext(filename)[0]
        d = stem[0:10]
        days.append(d[8:10] + "/" + d[5:7] + "/" + d[:4])
    return days


def _to_compact(day):
    """Convert 'dd/mm/yyyy' to the 'yyyymmdd' form used in HKJC URLs."""
    return day[6:10] + day[3:5] + day[0:2]


def _scrape_entries(deviation):
    """Return (entries_content, found) for compact day *deviation*.

    Tries the Happy Valley ('HV') card first, then Sha Tin ('ST') — this
    replaces the original's duplicated request/parse code.
    """
    entries_table = None
    for place in ('HV', 'ST'):
        soup = _get_soup(ENTRIES_BASE + deviation + '/' + place)
        entries_table = soup.find('table', class_='col_12')
        if entries_table is not None:
            break
    entries_content = []  # BUG FIX: always bound, even when no table is found
    if entries_table:
        # The first six rows are table chrome, not entries.
        for tr in entries_table.find_all('tr')[6:]:
            for td in tr.find_all('td', {'class': ['alignL2', 'alignL2-grey']}):
                entries_content.append(td.text.strip('\n\r\t": '))
    return entries_content, entries_table is not None


def _scrape_result_page(deviation, page):
    """Return (soup, result_table) for one result page, trying HV then ST.

    result_table is None when neither venue has this page.
    """
    soup, table = None, None
    for place in ('HV', 'ST'):
        soup = _get_soup(RESULTS_BASE + deviation + '/' + place + '/' + str(page))
        table = soup.find('table',
                          class_='tableBorder trBgBlue tdAlignC number12 draggable')
        if table is not None:
            break
    return soup, table


def _mark_ace(result_content, entries_content):
    """Append the 'Ace' column to each result row, in place.

    Matches the horse name (result column 2, before any '(') against the
    scraped entries text.
    """
    for each_result in result_content:
        horse = each_result[2].split(sep='(')[0]
        for entry in entries_content:
            if horse not in entry:
                continue
            # BUG FIX: the original tested `('+' or '*' or '#') in entry`,
            # which evaluates to `'+' in entry` only — '*' and '#' were
            # never checked.
            if any(marker in entry for marker in ('+', '*', '#')):
                answer = entry.split(sep=horse)[1][1]
                each_result.append('-' if answer.isdigit() else answer)
            else:
                each_result.append('-')
        # A horse matching more than one entry appends extra cells; trim back
        # to the expected 13 columns (original behavior: drop one trailing cell).
        if len(each_result) > 13:
            del each_result[-1]


def main():
    site_days = _scrape_race_days()
    saved_days = _local_race_days(LOCAL_FOLDER)
    # BUG FIX: the original rebuilt this comprehension once per element inside
    # a loop and relied on the leaked loop variable; one pass is enough.
    missing = [_to_compact(day) for day in site_days if day not in saved_days]

    for deviation in missing:
        # 'yyyymmdd' -> 'yyyy-mm-dd' workbook name ([6:8] fixes the original's
        # off-by-one [6:9] slice).
        booklet_name = deviation[0:4] + '-' + deviation[4:6] + '-' + deviation[6:8]

        entries_content, found = _scrape_entries(deviation)
        if not found:
            continue

        writer = pd.ExcelWriter(OUTPUT_FOLDER + booklet_name + '.xlsx',
                                engine='xlsxwriter')
        try:
            for page in range(1, 13):
                soup, result_table = _scrape_result_page(deviation, page)
                if result_table is None:
                    continue

                headers = []  # BUG FIX: always bound, even when <thead> is missing
                thead = soup.find('thead')
                if thead:
                    headers = [td.text.strip('\n\r\t": ')
                               for td in thead.find_all('td')]
                    headers += ['Ace']

                result_content = []
                for tr in result_table.find_all('tr',
                                                {'class': ['trBgGrey', 'trBgWhite']}):
                    row = [td.text.strip('\n\r\t": ')
                           for td in tr.find_all('td', {'nowrap': 'nowrap'})]
                    result_content.append(row)

                _mark_ace(result_content, entries_content)
                df = pd.DataFrame(result_content, columns=headers)
                df.to_excel(writer, sheet_name='Race' + str(page))
        finally:
            # BUG FIX: the original never saved/closed the writer, so the
            # workbook was never flushed to disk. (On pandas < 1.3 use
            # writer.save() instead.)
            writer.close()


if __name__ == "__main__":
    main()