贴一波爬取 LeetCode 每日一题的代码,没什么技术含量,基本就是硬套正则 + "连连看",以后再看看有没有更好的方法。以下代码中的注释都使用 # 单行注释(Sagemath 选手狂喜)。
# 准备工作
from selenium import webdriver
import time
import re
import sys
time 主要用来做延迟操作,re 用来做正则匹配,sys 只用于在最后设置退出码,可有可无。这里主要使用 selenium 库,据说用 pyppeteer 更好,咱也没注意2333(主要是没看到教程555)。下面说几个注意事项:
1. 配置好 python 环境,python 要写入环境变量(python 安装程序中该选项默认未勾选,注意勾选)。
2. 安装 selenium 库:写入环境变量后,在 cmd 下直接使用命令 pip install selenium 安装。
3. 安装对应的浏览器驱动(driver)。我用的是 chrome,所以要下载对应的 chromedriver。这里要注意驱动版本必须与 chrome 本身的版本一致,这一点网上有资料可以参考。另外,如果想固定版本,可以停用 chrome 的更新服务。
爬取网页部分
def GetPageText(url: str) -> str:
    """Load ``url`` in Chrome and return the part of the page source to be parsed.

    The page is opened in a Selenium-driven Chrome instance, given a fixed
    10 second pause, and its source is then cut down to the text between the
    "每日 1 题" marker and the "可查看该题目在真实面试中的出现频率" marker, so
    that the regular expressions applied later only see a small fragment.

    Args:
        url: Address of the page to load.

    Returns:
        The page source from the start marker up to (not including) the first
        end marker that follows it. If no end marker follows, the slice runs
        to the end of the page. If the start marker is missing, an empty
        string is returned.
    """
    startMarker = "每日 1 题"
    endMarker = "可查看该题目在真实面试中的出现频率"

    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Fixed pause to give the browser time to respond before reading the source
        time.sleep(10)
        page_text = browser.page_source
    finally:
        # Close the browser even when loading or reading the page fails
        browser.quit()

    idxStart = page_text.find(startMarker)
    if idxStart == -1:
        # str.find returns -1 when absent; using it as a slice index would be wrong
        return ""
    idxEnd = page_text.find(endMarker, idxStart + len(startMarker))
    if idxEnd == -1:
        idxEnd = len(page_text)
    return page_text[idxStart:idxEnd]
这里需要将 chromedriver 放到 python 的安装目录(或其他已加入环境变量 PATH 的目录)下,浏览器才能正常启动。由于只是静态数据爬取,操作量不大,所以只需要简单地访问网页然后记录页面源码即可。在返回前先把需要用到的那部分网页内容切片出来,避免内容过多导致后面数据处理时正则匹配困难。
筛选数据部分
def MatchItems(tempText: str) -> list:
    """Pull the daily problem's fields out of a fragment of the problem-list HTML.

    Each field is located with one regular expression and read from a capture
    group. The fragment and every extracted value are printed as a debugging
    aid, in the same order as they are extracted.

    Args:
        tempText: HTML fragment to search (the text produced by ``GetPageText``).

    Returns:
        Six strings in this order: the number that follows the
        ``label="[object Object]"`` attribute, the ``data-original-title``
        value, the lowercase suffix of the first ``level-...`` class, the first
        percentage of the form ``12.3%``, the number inside the
        ``solution-link`` element, and the first ``<a href>`` target prefixed
        with ``https://leetcode-cn.com``.

    Raises:
        ValueError: If one of the fields cannot be found in ``tempText``.
    """

    def capture(pattern: str, group: int, what: str) -> str:
        # Return one capture group of the first match; fail with a message that
        # names the missing field instead of an AttributeError on None.
        found = re.search(pattern, tempText)
        if found is None:
            raise ValueError("MatchItems: could not find the " + what + " in the page text")
        return found.group(group)

    print(tempText)

    # Raw strings keep the regex escapes (e.g. \[) out of Python's own escape handling
    num = capture(r'label="\[object Object]">([0-9]{1,})([<])', 1, "problem number")
    print(num)

    title = capture(r'data-original-title=(["])([^"]{1,})(["])', 2, "title")
    print(title)

    passRate = capture(r'([0-9]{1,})([.])([0-9]{1,})([%])', 0, "pass rate")
    print(passRate)

    level = capture(r'level-([a-z]{1,})', 1, "difficulty level")
    print(level)

    solution = capture(r'class="solution-link">([0-9]{1,})([<])', 1, "solution count")
    print(solution)

    # The page only carries a site-relative link; prepend the site address
    suffixHref = capture(r'<a href="([^"]{1,})(["])', 1, "problem link")
    fullHref = "https://leetcode-cn.com" + suffixHref
    print(fullHref)

    return [num, title, level, passRate, solution, fullHref]
首先是看LeetCode对应界面的源码(F12看,还有直接右键检查),主要是看网页布局,看看如何使用正则匹配,因为标签较多,要找到合适的参考系(乐了,物理学渣瑟瑟发抖)。 下面基本上是连连看,大部分都是两次正则匹配出结果,大家可以先看看正则匹配如何使用,这其实是非常好用的搜索方式,即使是会使用beautiful soup也最好看看正则应该怎么做。
写文件部分
def WriteFile(path: str) -> bool:
    """Append today's daily-problem row to the log file at ``path``.

    The file is opened for update, or created when that fails. If it already
    contains today's date (``YYYY-MM-DD``) nothing is fetched or written.
    Otherwise the problem list page is scraped with ``GetPageText`` and
    ``MatchItems`` and one row is appended, using ``|`` separators and tab
    padding so the line works as a markdown table row.

    Args:
        path: Location of the log file.

    Returns:
        True if today's row was already present or has been written; False if
        the file could neither be opened for update nor created.
    """
    curTime = time.strftime("%Y-%m-%d", time.localtime())

    try:
        ofile = open(path, "r+")
        isExisting = True
    except IOError:
        # Could not open for update (e.g. the file does not exist yet): try to create it
        try:
            ofile = open(path, "w+")
        except IOError:
            return False
        isExisting = False

    # The with-block closes the file on every exit path, including the early
    # return below and any exception raised while scraping.
    with ofile:
        # Reading to the end also leaves the position at EOF, so the write appends
        if isExisting and curTime in ofile.read():
            return True

        pageText = GetPageText("https://leetcode-cn.com/problemset/all/")
        itemList = MatchItems(pageText)

        row = ["|", curTime, "\t"]
        for i, item in enumerate(itemList):
            row.append("|" + item + "\t\t")
            if i == 1:
                # One extra tab after the second column, as in the original layout
                row.append("\t")
        row.append("|\n")
        ofile.write("".join(row))
    return True
这里的 try/except 主要是为了处理文件打不开的情况(open 失败会抛出 IOError,人话:没读到那个文件),避免之后依然对文件进行操作引起程序出错。做好各种检验工作后,正常调用前面两个部分的函数,把得到的结果按要求写入文件。这里主要是为了兼顾 markdown 语法和排版,所以加了制表符和 |。最后注意关闭文件!这个操作蛮重要的,上次 C++ 写文件没有 close,结果出了各种问题,23333
旧版完整代码
贴一下完整代码,他这个代码风格好像对python兼容挺差的,很多高亮没做好,最好是复制到pycharm (什么,为啥要用pycharm,我还有100种python ide)上看。
from selenium import webdriver
import time
import re
import sys
def GetPageText(url: str) -> str:
    """Load ``url`` in Chrome and return the part of the page source to be parsed.

    The page is opened in a Selenium-driven Chrome instance, given a fixed
    10 second pause, and its source is then cut down to the text between the
    "每日 1 题" marker and the "可查看该题目在真实面试中的出现频率" marker, so
    that the regular expressions applied later only see a small fragment.

    Args:
        url: Address of the page to load.

    Returns:
        The page source from the start marker up to (not including) the first
        end marker that follows it. If no end marker follows, the slice runs
        to the end of the page. If the start marker is missing, an empty
        string is returned.
    """
    startMarker = "每日 1 题"
    endMarker = "可查看该题目在真实面试中的出现频率"

    browser = webdriver.Chrome()  # open a Chrome window driven by Selenium
    try:
        browser.get(url)  # visit the requested URL
        # Pause so no data is missed while waiting for the browser to respond
        time.sleep(10)
        page_text = browser.page_source
    finally:
        # Close the browser even when loading or reading the page fails
        browser.quit()

    idxStart = page_text.find(startMarker)  # where the slice begins
    if idxStart == -1:
        # str.find returns -1 when absent; using it as a slice index would be wrong
        return ""
    idxEnd = page_text.find(endMarker, idxStart + len(startMarker))  # where it ends
    if idxEnd == -1:
        idxEnd = len(page_text)
    return page_text[idxStart:idxEnd]
def MatchItems(tempText: str) -> list:
    """Pull the daily problem's fields out of a fragment of the problem-list HTML.

    Each field is located with one regular expression and read from a capture
    group. The fragment and every extracted value are printed as a debugging
    aid, in the same order as they are extracted.

    Args:
        tempText: HTML fragment to search (the text produced by ``GetPageText``).

    Returns:
        Six strings in this order: the number that follows the
        ``label="[object Object]"`` attribute, the ``data-original-title``
        value, the lowercase suffix of the first ``level-...`` class, the first
        percentage of the form ``12.3%``, the number inside the
        ``solution-link`` element, and the first ``<a href>`` target prefixed
        with ``https://leetcode-cn.com``.

    Raises:
        ValueError: If one of the fields cannot be found in ``tempText``.
    """

    def capture(pattern: str, group: int, what: str) -> str:
        # Return one capture group of the first match; fail with a message that
        # names the missing field instead of an AttributeError on None.
        found = re.search(pattern, tempText)
        if found is None:
            raise ValueError("MatchItems: could not find the " + what + " in the page text")
        return found.group(group)

    print(tempText)  # show the fragment being searched

    # Raw strings keep the regex escapes (e.g. \[) out of Python's own escape handling.
    # The surrounding tag text anchors the match; the capture group is the value itself.
    num = capture(r'label="\[object Object]">([0-9]{1,})([<])', 1, "problem number")
    print(num)  # show the matched value

    title = capture(r'data-original-title=(["])([^"]{1,})(["])', 2, "title")
    print(title)

    passRate = capture(r'([0-9]{1,})([.])([0-9]{1,})([%])', 0, "pass rate")
    print(passRate)

    level = capture(r'level-([a-z]{1,})', 1, "difficulty level")
    print(level)

    solution = capture(r'class="solution-link">([0-9]{1,})([<])', 1, "solution count")
    print(solution)

    # The scraped link is site-relative; prepend the site address to make it a full URL
    suffixHref = capture(r'<a href="([^"]{1,})(["])', 1, "problem link")
    fullHref = "https://leetcode-cn.com" + suffixHref
    print(fullHref)

    # Assemble the list handed back to the caller
    return [num, title, level, passRate, solution, fullHref]
def WriteFile(path: str) -> bool:
    """Append today's daily-problem row to the log file at ``path``.

    The file is opened for update, or created when that fails. If it already
    contains today's date (``YYYY-MM-DD``) nothing is fetched or written.
    Otherwise the problem list page is scraped with ``GetPageText`` and
    ``MatchItems`` and one row is appended, using ``|`` separators and tab
    padding so the line works as a markdown table row.

    Args:
        path: Location of the log file.

    Returns:
        True if today's row was already present or has been written; False if
        the file could neither be opened for update nor created.
    """
    curTime = time.strftime("%Y-%m-%d", time.localtime())  # today's date

    try:
        ofile = open(path, "r+")
        isExisting = True
    except IOError:
        # Could not open for update (e.g. the file does not exist yet): try to create it
        try:
            ofile = open(path, "w+")
        except IOError:
            return False
        isExisting = False

    # The with-block closes the file on every exit path, including the early
    # return below and any exception raised while scraping.
    with ofile:
        # Today's date is already recorded: nothing to do.
        # Reading to the end also leaves the position at EOF, so the write appends.
        if isExisting and curTime in ofile.read():
            return True

        # Fetch the page, then extract the fields from it
        pageText = GetPageText("https://leetcode-cn.com/problemset/all/")
        itemList = MatchItems(pageText)

        # Build the output row in the required print format
        row = ["|", curTime, "\t"]
        for i, item in enumerate(itemList):
            row.append("|" + item + "\t\t")
            if i == 1:
                # One extra tab after the second column, as in the original layout
                row.append("\t")
        row.append("|\n")
        ofile.write("".join(row))
    return True
# Record today's problem in the log file at the given path, then report the
# outcome through the process exit status: 0 when WriteFile returned True,
# 1 when it returned False.
isRunning = WriteFile("C:\\leetcode\\leetcode.txt")
sys.exit(0 if isRunning else 1)
新版完整代码
由于力扣官方改版,对网页布局调整,原先的代码寄了,贴一下现在的代码,整体结构类似,仅修改了匹配部分,注释参考旧版
from selenium import webdriver
import time
import re
import sys
def GetPageText(url: str) -> str:
    """Load ``url`` in Chrome and return the part of the page source to be parsed.

    The page is opened in a Selenium-driven Chrome instance, given a fixed
    10 second pause, and its source is then cut down to the text between the
    first ``tr data-row-key`` occurrence and the next ``tr data-row-key="1.``
    occurrence, so that the regular expressions applied later only see a
    small fragment.
    NOTE(review): presumably the first table row is the daily problem and the
    row keyed ``1.`` starts the regular list; confirm against the live page.

    Args:
        url: Address of the page to load.

    Returns:
        The page source from the start marker up to (not including) the first
        end marker that follows it. If no end marker follows, the slice runs
        to the end of the page. If the start marker is missing, an empty
        string is returned.
    """
    startMarker = "tr data-row-key"
    endMarker = 'tr data-row-key="1.'

    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Fixed pause to give the browser time to respond before reading the source
        time.sleep(10)
        page_text = browser.page_source
    finally:
        # Close the browser even when loading or reading the page fails
        browser.quit()

    idxStart = page_text.find(startMarker)
    if idxStart == -1:
        # str.find returns -1 when absent; using it as a slice index would be wrong
        return ""
    # The start marker is a prefix of the end marker, so the end search must begin
    # past the start; otherwise both can land on the same row and give an empty slice.
    idxEnd = page_text.find(endMarker, idxStart + len(startMarker))
    if idxEnd == -1:
        idxEnd = len(page_text)
    return page_text[idxStart:idxEnd]
def MatchItems(tempText: str) -> list:
    """Pull the daily problem's fields out of one table row of the problem-list HTML.

    Each field is located with a regular expression (the difficulty with a
    plain substring test). The fragment and every extracted value are printed
    as a debugging aid, in the same order as they are extracted.

    Args:
        tempText: HTML fragment to search (the text produced by ``GetPageText``).

    Returns:
        Six strings in this order: the ``data-row-key`` value up to its first
        digit run, the title text between the first ``.`` and the next ``<`` of
        the title link (not stripped, so it may start with a space), the
        difficulty ("简单", "中等", "困难", or "" when none occurs), the first
        percentage of the form ``12.3%``, the number inside the
        ``class="truncate"`` element, and the first ``<a href>`` target
        prefixed with ``https://leetcode-cn.com``.

    Raises:
        ValueError: If one of the regex-matched fields cannot be found.
    """

    def capture(pattern: str, text: str, what: str):
        # Return the first match of pattern in text; fail with a message that
        # names the missing field instead of an AttributeError on None.
        found = re.search(pattern, text)
        if found is None:
            raise ValueError("MatchItems: could not find the " + what + " in the page text")
        return found

    print(tempText)

    # Groups 2 and 3 together are everything between the opening quote and the '.'
    numMatch = capture(r'tr data-row-key=(["])([^0-9]{0,})([0-9]{1,})([.])', tempText, "problem number")
    num = numMatch.group(2) + numMatch.group(3)
    print(num)

    # First isolate the title link's text, then take what lies between its
    # first '.' and the following '<'.
    titleCell = capture(
        r'class="h-5 truncate hover:text-primary-s dark:hover:text-dark-primary-s">([^0-9]{0,}[0-9]{1,})([.])([^>]{1,})([>])',
        tempText,
        "title",
    ).group(0)
    title = capture(r'([.])([^<]{1,})([<])', titleCell, "title").group(2)
    print(title)

    passRate = capture(r'([0-9]{1,})([.])([0-9]{1,})([%])', tempText, "pass rate").group(0)
    print(passRate)

    # No break on purpose: when several labels occur, the last one in this order wins
    level = ""
    for candidate in ("简单", "中等", "困难"):
        if candidate in tempText:
            level = candidate
    print(level)

    solution = capture(r'class="truncate">([0-9]{1,})([<])', tempText, "solution count").group(1)
    print(solution)

    # The scraped link is site-relative; prepend the site address to make it a full URL
    suffixHref = capture(r'<a href="([^"]{1,})(["])', tempText, "problem link").group(1)
    fullHref = "https://leetcode-cn.com" + suffixHref
    print(fullHref)

    return [num, title, level, passRate, solution, fullHref]
def WriteFile(path: str) -> bool:
    """Append today's daily-problem row to the log file at ``path``.

    The file is opened for update, or created when that fails. If it already
    contains today's date (``YYYY-MM-DD``) nothing is fetched or written.
    Otherwise the problem list page is scraped with ``GetPageText`` and
    ``MatchItems`` and one row is appended, using ``|`` separators and tab
    padding so the line works as a markdown table row.

    Args:
        path: Location of the log file.

    Returns:
        True if today's row was already present or has been written; False if
        the file could neither be opened for update nor created.
    """
    curTime = time.strftime("%Y-%m-%d", time.localtime())

    try:
        ofile = open(path, "r+")
        isExisting = True
    except IOError:
        # Could not open for update (e.g. the file does not exist yet): try to create it
        try:
            ofile = open(path, "w+")
        except IOError:
            return False
        isExisting = False

    # The with-block closes the file on every exit path, including the early
    # return below and any exception raised while scraping.
    with ofile:
        # Reading to the end also leaves the position at EOF, so the write appends
        if isExisting and curTime in ofile.read():
            return True

        pageText = GetPageText("https://leetcode-cn.com/problemset/all/")
        itemList = MatchItems(pageText)

        row = ["|", curTime, "\t"]
        for i, item in enumerate(itemList):
            row.append("|" + item + "\t\t")
            if i == 1:
                # One extra tab after the second column, as in the original layout
                row.append("\t")
        row.append("|\n")
        ofile.write("".join(row))
    return True
# Record today's problem in the log file at the given path, then report the
# outcome through the process exit status: 0 when WriteFile returned True,
# 1 when it returned False.
isRunning = WriteFile("C:\\leetcode\\leetcode.txt")
sys.exit(0 if isRunning else 1)