import requests
from bs4 import BeautifulSoup
import pandas as pd
# Fixed prefix of every results-page URL; the event code is appended directly
# after it, followed by the Race, sn and pagenum query parameters.
urlHead = 'https://www.run2pix.com/report/report_w.php?EventCode='

# One row per edition, newest first; each row holds two races.
# A race is [event code, race type, sn query value, number of result pages].
# The race type also ends up in the 'Full_half' output column
# ('MA' / 'HM' -- presumably full and half marathon; confirm with the site).
eventCode = [
    [['20171217', 'MA', '161', 12], ['20171217', 'HM', '162', 32]],
    [['20161218', 'MA', '136', 12], ['20161218', 'HM', '137', 28]],
    [['20151220', 'MA', '111', 10], ['20151220', 'HM', '112', 20]],
    [['20141221', 'MA', '86', 11], ['20141221', 'HM', '87', 34]],
    [['20131215', 'MA', '57', 10], ['20131215', 'HM', '58', 31]],
]

# Year -> that edition's pair of races (2017 maps to the first row, 2013 to
# the last).
yearCheck = dict(zip(range(2017, 2012, -1), eventCode))

# Upper bound on the number of runner rows read from a single result page.
pageRunnerNum = 500
def checkTime(x):
    """Repair an ``H:M:S`` time string whose minute field reads 60.

    When the minute field equals 60 the hour is advanced by one, the minutes
    become ``00`` and the seconds text is kept exactly as given. Any other
    value is returned unchanged.

    The rolled-over hour is zero-padded to two digits, so ``09:60:05``
    becomes ``10:00:05`` (prefixing a literal ``'0'`` would yield
    ``010:00:05``).

    Args:
        x: Time text made of exactly three ``:``-separated integer fields.

    Returns:
        The corrected time string, or ``x`` itself when no fix is needed.

    Raises:
        ValueError: If ``x`` does not consist of three integer fields.
    """
    fields = x.split(":")
    # Converting all three fields keeps the input validation: anything other
    # than three integers raises ValueError here.
    hours, minutes, _seconds = map(int, fields)
    if minutes != 60:
        return x
    return "{:02d}:00:{}".format(hours + 1, fields[2])
def secondSelect(gender):
    """Build the output row for the runner currently being processed.

    Reads the module-level names ``runner`` (cell texts of the current table
    row), ``y`` (year), ``case`` (race index within the year) and
    ``yearCheck``. The category comes from the group label ``runner[3]``,
    looking only at the characters after its first one.

    Args:
        gender: Gender code stored in the row ('F' or 'M' at the call sites).

    Returns:
        ``[year, race type, runner[5], runner[4], runner[7], gender,
        category]`` -- the same order as the Year / Full_half / Rank /
        Official_Time / Net_Time / Gender / Catagory output columns.

    Raises:
        IndexError: If the group label has fewer than two characters or the
            row has too few cells.
    """
    label = runner[3]
    second_char = label[1]

    # Categories identified by the second character alone.
    single_char_categories = {
        u'\u570b': 'Invited',
        '6': '60+',
        '7': '60+',
        u'\u8996': 'Visually Impaired',
    }
    # Categories identified by the second and third characters together.
    two_char_categories = {
        '19': 'Under 20',
        u'\u5b50\u7d44': 'HM',
    }

    category = single_char_categories.get(second_char)
    if category is None:
        # Unknown labels fall back to up to five characters of the label.
        category = two_char_categories.get(label[1:3], label[1:6])

    return [y, yearCheck[y][case][1], runner[5], runner[4], runner[7], gender, category]
def groupSelect(s):
    """Return the output row for the current runner, tagged with a gender.

    The row built by ``secondSelect`` is returned directly. Reading a
    module-level ``runnerInfo`` instead would fail with NameError on the
    first call and hand back the previous runner's row on later ones.

    Args:
        s: First character of the runner's group label. The character
            ``u'\u5973'`` selects 'F'; anything else selects 'M'.

    Returns:
        The list produced by ``secondSelect`` for the chosen gender code.
    """
    gender = 'F' if s == u'\u5973' else 'M'
    return secondSelect(gender)
# Scrape every result page of every race listed in yearCheck, turn each
# runner row into an output record, and write all records to one CSV file.
multiRunner_info = []
finishTime = []  # not used by the code visible here; kept in case it is read elsewhere

# NOTE: `y`, `case` and `runner` must stay module-level names with exactly
# these spellings -- secondSelect() reads them as globals.
for y in range(2013, 2018):
    for case in range(0, 2):
        event_code, race, sn, pageCount = yearCheck[y][case]
        for k in range(pageCount):
            # Pages are numbered from 1 in the query string.
            url = "{}{}&Race={}&sn={}&pagenum={}".format(urlHead, event_code, race, sn, k + 1)
            # Without a timeout a single stalled connection blocks the whole
            # run; raise_for_status() reports an HTTP error as such instead
            # of failing later while parsing an error page.
            page = requests.get(url, timeout=30)
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'html.parser')
            # Collect the rows once per page rather than re-searching the
            # document for every runner.
            rows = soup.find_all('table')[1].find_all('tr')
            # The first 11 rows are skipped (presumably page header rows --
            # confirm against the site). Slicing cannot run past the end of
            # the row list, and at most pageRunnerNum rows are read.
            for runnerTable in rows[11:11 + pageRunnerNum]:
                # A row with fewer than 4 child nodes ends the runner list.
                if len(runnerTable) < 4:
                    break
                runner = [td.get_text(strip=True) for td in runnerTable.find_all('td')]
                runner[4] = checkTime(runner[4])
                runner[7] = checkTime(runner[7])
                # runner[3][0] is the gender character of the group label.
                runnerInfo = groupSelect(runner[3][0])
                # Append in scrape order. A position computed from the page
                # and row number is only valid within one race and would
                # splice later races into the rows already collected.
                multiRunner_info.append(runnerInfo)

df = pd.DataFrame(
    multiRunner_info,
    columns=['Year', 'Full_half', 'Rank', 'Official_Time', 'Net_Time', 'Gender', 'Catagory'],
)
df.to_csv('allTPE_Runner2013_2017.csv', encoding='utf-8-sig')