爬取每日一题代码分析


贴一波爬取LeetCode每日一题的代码,也没啥技术含量,就是硬套+连连看,以后再看看有没有更好的方法。以下注释都使用#单行注释(Sagemath选手狂喜)。

准备工作

from selenium import webdriver
import time
import re
import sys
time主要用来做延迟操作,re用来引入正则匹配,sys则可有可无(只用于设置退出码)。这里主要使用selenium库,据说用pyppeteer更好,咱也没试过2333(主要是没看到教程555)。下面主要说几个注意事项:
1. 配置好python环境,python要写入环境变量(python安装程序默认未勾选该选项,注意勾选)。
2. 安装selenium库:写入环境变量后,在cmd下直接使用命令 pip install selenium 安装。
3. 然后要安装对应的driver,我用的是chrome,所以要下载对应的chromedriver。这里要注意它与chrome本身的版本要一致,这一点网上有资料可以参考。另外,如果要保持版本不变,可以停用chrome的更新服务。


爬取网页部分

def GetPageText(url: str) -> str:
    """Load ``url`` in Chrome and return the HTML slice around the daily problem.

    A Selenium-driven Chrome window is opened, the page is given a fixed
    10 seconds to finish loading, and the page source is cut down to the
    text between the start marker "每日 1 题" and the end marker
    "可查看该题目在真实面试中的出现频率" so that later regex matching works on a
    small fragment instead of the whole page.

    Args:
        url: Address of the page to load.

    Returns:
        The page source from the start marker up to (not including) the end
        marker. An empty string if the start marker is absent; the text from
        the start marker to the end of the page if only the end marker is
        absent.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        time.sleep(10)
        page_text = browser.page_source
    finally:
        # Close the browser even when loading fails, otherwise the Chrome
        # process is left running.
        browser.quit()

    idxStart = page_text.find("每日 1 题")
    if idxStart == -1:
        # str.find() returns -1 on a miss; using that as a slice start would
        # silently select from the last character instead.
        return ""
    idxEnd = page_text.find("可查看该题目在真实面试中的出现频率")
    if idxEnd == -1:
        # Same pitfall for the end index: -1 would drop the final character.
        idxEnd = len(page_text)
    return page_text[idxStart:idxEnd]

这里需要将chrome驱动放入python库的目录下才能正常启动。由于只是静态数据爬取,操作量也就那样,所以前面只需要简单的访问网页然后记录即可。在返回前先将需要使用的部分网页切片,避免内容过多在数据处理时正则匹配困难。


筛选数据部分

def MatchItems(tempText: str) -> list:
    """Extract the daily problem's fields from a fragment of the problem-list HTML.

    Each field is located with one regular expression and read from a capture
    group. The fragment and every extracted value are printed as they are
    found, as a debugging aid.

    Args:
        tempText: HTML fragment to search.

    Returns:
        A list of six strings:
        ``[number, title, level, pass rate, solution count, full URL]``.

    Raises:
        ValueError: If any of the fields cannot be found in ``tempText``.
    """
    def _capture(pattern: str, field: str, group: int = 1) -> str:
        # Return the requested group of the first match, or fail with a
        # message naming the field instead of an AttributeError on None.
        found = re.search(pattern, tempText)
        if found is None:
            raise ValueError("MatchItems: could not find {} in the page text".format(field))
        return found.group(group)

    print(tempText)

    # Raw strings keep "\[" a regex escape rather than an invalid string escape.
    num = _capture(r'label="\[object Object\]">([0-9]+)<', "the problem number")
    print(num)

    title = _capture(r'data-original-title="([^"]+)"', "the problem title")
    print(title)

    # The whole match (digits, dot, digits, percent sign) is the pass rate.
    passRate = _capture(r'[0-9]+\.[0-9]+%', "the pass rate", group=0)
    print(passRate)

    level = _capture(r'level-([a-z]+)', "the difficulty level")
    print(level)

    solution = _capture(r'class="solution-link">([0-9]+)<', "the solution count")
    print(solution)

    # The page only carries the path; prefix the site to get a usable link.
    fullHref = "https://leetcode-cn.com" + _capture(r'<a href="([^"]+)"', "the problem link")
    print(fullHref)

    return [num, title, level, passRate, solution, fullHref]

首先是看LeetCode对应界面的源码(F12看,还有直接右键检查),主要是看网页布局,看看如何使用正则匹配,因为标签较多,要找到合适的参考系(乐了,物理学渣瑟瑟发抖)。 下面基本上是连连看,大部分都是两次正则匹配出结果,大家可以先看看正则匹配如何使用,这其实是非常好用的搜索方式,即使是会使用beautiful soup也最好看看正则应该怎么做。


写文件部分

def WriteFile(path: str) -> bool:
    """Append today's daily-problem row to the log file at ``path``, once per day.

    If the file already contains today's date (``YYYY-MM-DD``), nothing is
    scraped or written. Otherwise the problem list page is fetched with
    ``GetPageText``, parsed with ``MatchItems``, and one ``|``-separated,
    tab-padded row is appended.

    Args:
        path: Location of the log file. It is created if it does not exist.

    Returns:
        False if the file cannot be opened; True if today's row was already
        present or has just been appended.
    """
    curTime = time.strftime("%Y-%m-%d", time.localtime())
    try:
        # "a+" creates a missing file, never truncates an existing one, and
        # always writes at the end, so a first run also records its row.
        ofile = open(path, "a+")
    except OSError:
        return False

    # The context manager closes the file on every exit path, including the
    # early return below and any exception raised while scraping.
    with ofile:
        ofile.seek(0)  # "a+" starts positioned at the end; rewind to read
        if curTime in ofile.read():
            return True

        pageText = GetPageText("https://leetcode-cn.com/problemset/all/")
        itemList = MatchItems(pageText)

        cells = ["|", curTime + "\t"]
        for i, item in enumerate(itemList):
            cells.append("|")
            cells.append(item + "\t\t")
            if i == 1:
                # The second column gets one extra tab of padding.
                cells.append("\t")
        cells.append("|\n")
        ofile.write("".join(cells))
    return True

这里的try except主要是为了避免文件指针为空(人话:没读到那个文件)后依然进行操作引起程序出错。 做好各种检验工作后正常调用前面两个部分的函数。得到的结果按要求写入文件。这里主要是为了兼顾markdown语法和排版,所以加了制表符和|。最后注意关闭文件!这个操作蛮重要的,上次C++写文件没有close然后各种g,23333


旧版完整代码

贴一下完整代码,他这个代码风格好像对python兼容挺差的,很多高亮没做好,最好是复制到pycharm (什么,为啥要用pycharm,我还有100种python ide)上看。

from selenium import webdriver
import time
import re
import sys


def GetPageText(url: str) -> str:
    """Load ``url`` in Chrome and return the HTML slice around the daily problem.

    A Selenium-driven Chrome window is opened, the page is given a fixed
    10 seconds to finish loading, and the page source is cut down to the
    text between the start marker "每日 1 题" and the end marker
    "可查看该题目在真实面试中的出现频率".

    Args:
        url: Address of the page to load.

    Returns:
        The page source from the start marker up to (not including) the end
        marker. An empty string if the start marker is absent; the text from
        the start marker to the end of the page if only the end marker is
        absent.
    """
    browser = webdriver.Chrome()  # open a Chrome window driven by Selenium
    try:
        browser.get(url)
        # Pause so the browser has time to respond and no data is missed.
        time.sleep(10)
        page_text = browser.page_source
    finally:
        # Close the browser even when loading fails, otherwise the Chrome
        # process is left running.
        browser.quit()

    idxStart = page_text.find("每日 1 题")  # start of the wanted region
    if idxStart == -1:
        # str.find() returns -1 on a miss; using that as a slice start would
        # silently select from the last character instead.
        return ""
    idxEnd = page_text.find("可查看该题目在真实面试中的出现频率")  # end of the wanted region
    if idxEnd == -1:
        # Same pitfall for the end index: -1 would drop the final character.
        idxEnd = len(page_text)
    return page_text[idxStart:idxEnd]


def MatchItems(tempText: str) -> list:
    """Extract the daily problem's fields from a fragment of the problem-list HTML.

    Each field is located with one regular expression and read from a capture
    group. The fragment and every extracted value are printed as they are
    found, so the matches can be checked by eye.

    Args:
        tempText: HTML fragment to search.

    Returns:
        A list of six strings:
        ``[number, title, level, pass rate, solution count, full URL]``.

    Raises:
        ValueError: If any of the fields cannot be found in ``tempText``.
    """
    def _capture(pattern: str, field: str, group: int = 1) -> str:
        # Return the requested group of the first match, or fail with a
        # message naming the field instead of an AttributeError on None.
        found = re.search(pattern, tempText)
        if found is None:
            raise ValueError("MatchItems: could not find {} in the page text".format(field))
        return found.group(group)

    print(tempText)  # show the fragment being searched

    # Raw strings keep "\[" a regex escape rather than an invalid string escape.
    num = _capture(r'label="\[object Object\]">([0-9]+)<', "the problem number")
    print(num)  # show the matched value

    title = _capture(r'data-original-title="([^"]+)"', "the problem title")
    print(title)

    # The whole match (digits, dot, digits, percent sign) is the pass rate.
    passRate = _capture(r'[0-9]+\.[0-9]+%', "the pass rate", group=0)
    print(passRate)

    level = _capture(r'level-([a-z]+)', "the difficulty level")
    print(level)

    solution = _capture(r'class="solution-link">([0-9]+)<', "the solution count")
    print(solution)

    # The page only carries the path; prefix the site to get a complete link.
    fullHref = "https://leetcode-cn.com" + _capture(r'<a href="([^"]+)"', "the problem link")
    print(fullHref)

    # Fields in the order the caller writes them out.
    return [num, title, level, passRate, solution, fullHref]


def WriteFile(path: str) -> bool:
    """Append today's daily-problem row to the log file at ``path``, once per day.

    If the file already contains today's date (``YYYY-MM-DD``), nothing is
    scraped or written. Otherwise the problem list page is fetched with
    ``GetPageText``, parsed with ``MatchItems``, and one ``|``-separated,
    tab-padded row is appended.

    Args:
        path: Location of the log file. It is created if it does not exist.

    Returns:
        False if the file cannot be opened; True if today's row was already
        present or has just been appended.
    """
    curTime = time.strftime("%Y-%m-%d", time.localtime())  # today's date
    try:
        # "a+" creates a missing file, never truncates an existing one, and
        # always writes at the end, so a first run also records its row.
        ofile = open(path, "a+")
    except OSError:
        return False

    # The context manager closes the file on every exit path, including the
    # early return below and any exception raised while scraping.
    with ofile:
        ofile.seek(0)  # "a+" starts positioned at the end; rewind to read
        if curTime in ofile.read():
            # Today's date is already recorded: nothing to do.
            return True

        # Fetch the page, then extract the fields from it.
        pageText = GetPageText("https://leetcode-cn.com/problemset/all/")
        itemList = MatchItems(pageText)

        # Build the row: "|" separators with tab padding control the layout.
        cells = ["|", curTime + "\t"]
        for i, item in enumerate(itemList):
            cells.append("|")
            cells.append(item + "\t\t")
            if i == 1:
                # The second column gets one extra tab of padding.
                cells.append("\t")
        cells.append("|\n")
        ofile.write("".join(cells))
    return True


# Run the daily update against the log file at this path, then report the
# outcome through the process exit status: 0 when WriteFile returned True,
# 1 when it returned False.
isRunning = WriteFile("C:\\leetcode\\leetcode.txt")
sys.exit(0 if isRunning else 1)


新版完整代码

由于力扣官方改版,对网页布局调整,原先的代码寄了,贴一下现在的代码,整体结构类似,仅修改了匹配部分,注释参考旧版

from selenium import webdriver
import time
import re
import sys


def GetPageText(url: str) -> str:
    """Load ``url`` in Chrome and return the HTML slice for the first table row.

    A Selenium-driven Chrome window is opened, the page is given a fixed
    10 seconds to finish loading, and the page source is cut down to the
    text between the first occurrence of ``tr data-row-key`` and the first
    occurrence of ``tr data-row-key="1.``.

    Args:
        url: Address of the page to load.

    Returns:
        The page source from the start marker up to (not including) the end
        marker. An empty string if the start marker is absent; the text from
        the start marker to the end of the page if only the end marker is
        absent.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        time.sleep(10)
        page_text = browser.page_source
    finally:
        # Close the browser even when loading fails, otherwise the Chrome
        # process is left running.
        browser.quit()

    idxStart = page_text.find("tr data-row-key")
    if idxStart == -1:
        # str.find() returns -1 on a miss; using that as a slice start would
        # silently select from the last character instead.
        return ""
    idxEnd = page_text.find('tr data-row-key="1.')
    if idxEnd == -1:
        # Same pitfall for the end index: -1 would drop the final character.
        idxEnd = len(page_text)
    return page_text[idxStart:idxEnd]


def MatchItems(tempText: str) -> list:
    """Pull the daily problem's fields out of a fragment of the problem-list HTML.

    Most fields are found in two passes: a broad pattern anchors on the
    surrounding markup, then a narrow pattern picks the value out of that
    match. The fragment and every extracted value are printed as they are
    found.

    Args:
        tempText: HTML fragment to search.

    Returns:
        A list of six strings:
        ``[number, title, level, pass rate, solution count, full URL]``.
        ``level`` is the empty string when none of the three difficulty
        labels occurs in ``tempText``.

    Raises:
        AttributeError: If a pattern finds no match (``search`` returns None).
    """
    def first_match(pattern, text):
        # Entire text of the first match of ``pattern`` in ``text``.
        return re.compile(pattern).search(text).group(0)

    print(tempText)

    rowKeyChunk = first_match('tr data-row-key=(["])([^0-9]{0,})([0-9]{1,})([.])', tempText)
    # Drop the opening quote; whatever precedes the digits inside it is kept.
    num = first_match('(["])([^0-9]{0,})([0-9]{1,})', rowKeyChunk)[1:]
    print(num)

    titleChunk = first_match(
        'class="h-5 truncate hover:text-primary-s dark:hover:text-dark-primary-s">([^0-9]{0,}[0-9]{1,})([.])([^>]{1,})([>])',
        tempText,
    )
    # Strip the leading "." and the trailing "<" around the title text.
    title = first_match('([.])([^<]{1,})([<])', titleChunk)[1:-1]
    print(title)

    passRate = first_match('([0-9]{1,})([.])([0-9]{1,})([%])', tempText)
    print(passRate)

    # When several labels occur in the text, the last one in this order wins.
    labelsFound = [label for label in ("简单", "中等", "困难") if label in tempText]
    level = labelsFound[-1] if labelsFound else ""
    print(level)

    solutionChunk = first_match('class="truncate">([0-9]{1,})([<])', tempText)
    solution = first_match('([0-9]{1,})', solutionChunk)
    print(solution)

    hrefChunk = first_match('<a href="([^"]{1,})(["])', tempText)
    # Strip the surrounding quotes, then prefix the site to the path.
    fullHref = "https://leetcode-cn.com" + first_match('(["])([^"]{1,})(["])', hrefChunk)[1:-1]
    print(fullHref)

    return [num, title, level, passRate, solution, fullHref]


def WriteFile(path: str) -> bool:
    """Append today's daily-problem row to the log file at ``path``, once per day.

    If the file already contains today's date (``YYYY-MM-DD``), nothing is
    scraped or written. Otherwise the problem list page is fetched with
    ``GetPageText``, parsed with ``MatchItems``, and one ``|``-separated,
    tab-padded row is appended.

    Args:
        path: Location of the log file. It is created if it does not exist.

    Returns:
        False if the file cannot be opened; True if today's row was already
        present or has just been appended.
    """
    curTime = time.strftime("%Y-%m-%d", time.localtime())
    try:
        # "a+" creates a missing file, never truncates an existing one, and
        # always writes at the end, so a first run also records its row.
        ofile = open(path, "a+")
    except OSError:
        return False

    # The context manager closes the file on every exit path, including the
    # early return below and any exception raised while scraping.
    with ofile:
        ofile.seek(0)  # "a+" starts positioned at the end; rewind to read
        if curTime in ofile.read():
            return True

        pageText = GetPageText("https://leetcode-cn.com/problemset/all/")
        itemList = MatchItems(pageText)

        cells = ["|", curTime + "\t"]
        for i, item in enumerate(itemList):
            cells.append("|")
            cells.append(item + "\t\t")
            if i == 1:
                # The second column gets one extra tab of padding.
                cells.append("\t")
        cells.append("|\n")
        ofile.write("".join(cells))
    return True


# Run the daily update against the log file at this path, then report the
# outcome through the process exit status: 0 when WriteFile returned True,
# 1 when it returned False.
isRunning = WriteFile("C:\\leetcode\\leetcode.txt")
sys.exit(0 if isRunning else 1)

文章作者: Commander
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 Commander !
  目录