• 欢迎访问1024小神,一个只会Python的程序猿不是一个好司机
  • 有什么想对我说的可以在留言板里给我留言哦~
  • 如果您觉得本站非常有看点,那么赶紧使用Ctrl+D 收藏1024小神吧

Python爬虫:听中国有声小说批量下载 v0.3

爬虫实战 1024小神 11个月前 (11-26) 827次浏览 1个评论

目标站点:听中国 https://www.tingchina.com

更新内容
1.上个版本只支持有声书,这个版本理论上支持听中国上的所有音频(包括有声书,评书,相声等),但未全部测试,大概率会有不能下载的内容
py文件和exe文件下载链接:https://www.lanzous.com/i8m05xi

使用方法
1.输入目录页链接,如https://www.tingchina.com/pingshu/disp_1742.htm
小技巧:开启cmd快速编辑右键粘贴(看方法一即可):https://jingyan.baidu.com/article/c85b7a64618eb3003bac95d1.html
2.输入起始下载集数(从第一集开始下载直接回车)
源代码:

import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import time
import os
 
class TingChina():
    """Sequential downloader for one audio title on www.tingchina.com.

    An instance is bound to one title (``category`` + ``id``) and a starting
    episode number. ``run()`` walks the episodes in order and saves each one
    as ``<title>/<NNNN>.mp3`` relative to the current working directory.
    """

    # Browser-like headers sent with every page request.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    # Seconds to wait on a single HTTP request; without it a stalled
    # connection would block the whole batch forever.
    _TIMEOUT = 30

    # Raw-string patterns (the originals relied on invalid escape sequences).
    _EPISODE_LINK = re.compile(r'play.*?_(\d+)\.htm')
    _SRC_FLEI = re.compile(r'flei=(.*?)&')
    _SRC_BOOKNAME = re.compile(r'bookname=(.*?)&')
    _SRC_FILENAME = re.compile(r'filename=(.*?)&')
    _AUDIO_KEY = re.compile(r'url\[3\]= ".*?(key=.*?)";', re.S)

    def __init__(self,category,id,strat_num):
        """Bind the downloader to one title.

        Args:
            category: Site section taken from the catalogue URL
                (e.g. ``'yousheng'``, ``'pingshu'``).
            id: Numeric title id taken from the catalogue URL.
            strat_num: 1-based episode number to start from.
        """
        self.base_url = 'https://www.tingchina.com'
        self.category = category
        self.id = id
        # Index used in play-page URLs: one less than the episode number.
        self.num = int(strat_num)-1
        # Episode number used for the output file name.
        self.name_num = int(strat_num)
        self.Referer = ''
        self.host1 = "http://t44.tingchina.com"
        self.host2 = "http://t33.tingchina.com"
        self.book_name = ''

    def get_total_episode(self):
        """Fetch the catalogue page and return ``(title, episode_count)``.

        The episode count is the highest play-page index found among the
        last three list items, plus one.

        Raises:
            RuntimeError: If the page cannot be fetched or no episode link
                can be found in it.
        """
        url ='https://www.tingchina.com/{}/disp_{}.htm'.format(self.category,str(self.id))
        print(url)
        response = requests.get(url,headers=self._HEADERS,timeout=self._TIMEOUT)
        if response.status_code != 200:
            raise RuntimeError('catalogue page {} returned HTTP {}'.format(url,response.status_code))
        response.encoding='gbk'
        soup = BeautifulSoup(response.text,'lxml')
        episode_lists = soup.select('div.list > ul')
        if not episode_lists:
            raise RuntimeError('no episode list found on {}'.format(url))
        lis = episode_lists[0].select('li')
        last_index = None
        # Inspect at most the last three entries, newest first; slicing keeps
        # this safe when the list holds fewer than three items.
        for li in reversed(lis[-3:]):
            matched = self._EPISODE_LINK.search(str(li))
            if matched:
                last_index = int(matched.group(1))
                break
        if last_index is None:
            raise RuntimeError('no episode link found on {}'.format(url))
        titles = soup.select('title')
        name = titles[0].string if titles else None
        if not name:
            # The title doubles as the output directory name, so it must be
            # a non-empty string.
            name = '{}_{}'.format(self.category,self.id)
        return name,last_index+1

    def get_flash_url(self):
        """Fetch the play page of the current episode.

        Two URL layouts are tried in order; the first one answering with
        HTTP 200 wins.

        Returns:
            ``(html_text, page_url)`` of the play page.

        Raises:
            RuntimeError: If neither layout answers with HTTP 200.
        """
        candidates = (
            'https://www.tingchina.com/{}/{}/play_{}_{}.htm'.format(self.category,str(self.id),str(self.id),str(self.num)),
            'https://www.tingchina.com/{}/play/play_{}_{}.htm'.format(self.category,str(self.id),str(self.num)),
        )
        for url in candidates:
            response = requests.get(url,headers=self._HEADERS,timeout=self._TIMEOUT)
            if response.status_code==200:
                response.encoding='gbk'
                return response.text,url
        raise RuntimeError('play page not found for index {}'.format(self.num))

    def _build_real_address(self,src):
        """Build the audio file address from the player iframe ``src``.

        The path is ``host1/category[/flei][/bookname]/filename``; ``flei``
        and ``bookname`` are included only when present in ``src``.

        Raises:
            ValueError: If ``src`` carries no ``filename`` parameter.
        """
        filename_matched = self._SRC_FILENAME.search(src)
        if not filename_matched:
            raise ValueError('no filename parameter in player URL: {!r}'.format(src))
        segments = [self.category]
        for pattern in (self._SRC_FLEI,self._SRC_BOOKNAME):
            matched = pattern.search(src)
            if matched:
                segments.append(matched.group(1))
        segments.append(filename_matched.group(1))
        return self.host1+'/'+'/'.join(segments)

    def parse_flash_url(self):
        """Locate the player iframe of the current episode.

        Also stores the play-page URL in ``self.Referer``.

        Returns:
            ``(iframe_src, play_page_url, real_address)``.
        """
        html,url = self.get_flash_url()
        soup = BeautifulSoup(html,'lxml')
        src = soup.select('#playdiv')[0].iframe['src']
        self.Referer = url
        real_address = self._build_real_address(src)
        return src,url,real_address

    def get_audio(self):
        """Fetch the key and join it with real_address to get a usable URL.

        Returns:
            The downloadable audio URL, or ``None`` when the player page
            does not answer with HTTP 200 or contains no key.
        """
        temp_url,Referer,real_address =self.parse_flash_url()
        url = self.base_url + temp_url
        headers = dict(self._HEADERS)
        headers['Referer'] = Referer
        response = requests.get(url,headers=headers,timeout=self._TIMEOUT)
        if response.status_code==200:
            response.encoding='utf-8'
            matched = self._AUDIO_KEY.search(response.text)
            if matched:
                return(real_address+'?'+matched.group(1))
        return None

    def download(self):
        """Download the current episode into ``self.book_name``.

        Returns:
            ``True`` when a file was downloaded, ``False`` when no audio URL
            could be resolved (nothing is written in that case).
        """
        url = self.get_audio()
        print(url)
        if not url:
            return False
        downloadFILE(url,os.path.join(self.book_name,str(self.name_num).zfill(4)+'.mp3'),self.Referer)
        return True

    def run(self):
        """Download every episode from the start episode to the last one.

        A failing episode is printed, appended to ``log.txt`` and skipped so
        that one bad episode does not abort the whole batch.
        """
        name,total_episode = self.get_total_episode()
        print('书名:',name,'集数:',total_episode)
        self.book_name = name
        os.makedirs(name,exist_ok=True)
        while self.name_num <= total_episode:
            try:
                # Only an explicit False means "no URL"; this keeps overrides
                # that still return None working.
                if self.download() is False:
                    raise RuntimeError('no download address found')
            except Exception as e:
                print(self.name_num,e)
                with open('log.txt','a',encoding='utf-8') as f:
                    # Separator keeps the episode number and message apart.
                    f.write('{} {}\n'.format(self.name_num,e))
            self.num+=1
            self.name_num+=1
        print('all assignments done!')
         
 
def downloadFILE(url,name,Referer):
    """Stream ``url`` into the local file ``name`` with a progress bar.

    Args:
        url: Address of the audio file.
        name: Destination path; its directory must already exist.
        Referer: Value sent as the ``Referer`` request header.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never saved as an audio file.
    """
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Referer': Referer
    }
    resp = requests.get(url=url,stream=True,headers=headers,timeout=30)
    try:
        resp.raise_for_status()
        length = resp.headers.get('Content-Length')
        if length is None:
            # Some responses carry no length; tqdm then shows a plain counter.
            content_size = 'unknown'
            total_chunks = None
        else:
            total_bytes = int(length)
            content_size = total_bytes // 1024
            # Round up: the last, partial 1 KiB chunk is still one iteration.
            total_chunks = -(-total_bytes // 1024)
        with open(name, "wb") as f:
            print("Pkg total size is:",content_size,'k,start...')
            for data in tqdm(iterable=resp.iter_content(1024),total=total_chunks,unit='k',desc=name):
                f.write(data)
    finally:
        # A streamed response holds its connection until explicitly closed.
        resp.close()
    print(name , "download finished!")
 
def parse_disp_url(disp_url):
    """Extract ``(category, title_id)`` from a catalogue page URL.

    Example input: ``https://www.tingchina.com/yousheng/disp_21501.htm``.

    Returns:
        A ``(str, int)`` tuple, or ``None`` when the URL does not look like
        a tingchina.com catalogue page.
    """
    matched = re.search(r'tingchina\.com/(\w+)/disp_(\d+)\.htm',disp_url)
    if not matched:
        return None
    return matched.group(1),int(matched.group(2))


def main():
    """Ask for a catalogue URL and a start episode, then download."""
    disp_url = input('请输入目录页链接如:https://www.tingchina.com/yousheng/disp_21501.htm:')
    parsed = parse_disp_url(disp_url)
    # A title id of 0 is not a usable catalogue entry either.
    if parsed is None or not parsed[1]:
        print('输入的链接无法解析')
        return
    category,book_id = parsed

    start_num = input('请输入开始下载的集数(直接回车从第一集开始下载)').strip()
    if not start_num:
        start = 1
    else:
        try:
            start = int(start_num)
        except ValueError:
            start = 0
        # Episodes are numbered from 1; anything else would produce a
        # negative play-page index.
        if start < 1:
            print('集数必须是大于等于1的整数')
            return
    TingChina(category,book_id,start).run()


if __name__ == "__main__":
    main()
     
    # pyinstaller --onefile --windowed --icon=bitbug_favicon.ico tingchina_v0.3.py
    # pyinstaller -F -i bitbug_favicon.ico tingchina_v0.3.py
    # t = TingChina('yousheng',21501,143)
    # t = TingChina('pingshu',1660,126)
    # t = TingChina('xiangsheng',12567,1)
    # t = TingChina('erge',433,12)
    # t = TingChina('xiaohua',233,248)
    # t.run()

如有失效,请留言告知丨转载请注明原文链接:Python爬虫:听中国有声小说批量下载 v0.3
点赞 (1)

您必须 登录 才能发表评论!

(1)个小伙伴在吐槽
  1. bj2415
    这下有小说听了
    2020-12-30 14:15