从网站下载单个超大文件时偶尔超时的问题

下载大文件时(这里只针对单个文件),可能会由于网络波动、时间不够等原因导致下载失败。

下面这串代码就是用于解决上述问题:

1. 使用多线程加速下载,避免下载时间过长。
2. 实现断点续传:因各种原因中断导致下载失败后,能接着已下载的部分继续下载。

# -*- coding: utf-8 -*-

# @Time:2022/5/29 20:33

# @Author:Ye Zhoubing

# @File: download_large_file.py

# @software:PyCharm

"""

python 多线程下载大文件,并实现断点续传

"""

```python

# -*- coding: utf-8 -*-

# @Time:2022/5/29 20:33

# @Author:Ye Zhoubing

# @File: download_large_file.py

# @software:PyCharm

"""

python 多线程下载大文件,并实现断点续传

"""

import os

import time

import httpx

from tqdm import tqdm

from threading import Thread

import datetime

import sys

class Logger(object):
    """File-like object that copies everything written to it into a log file.

    Intended to be assigned to ``sys.stdout`` (or ``sys.stderr``) so that
    console output is shown on the original stream and recorded on disk.
    """

    def __init__(self, filename='default.log', stream=sys.stdout):
        """
        :param filename: path of the log file; it is truncated when opened
        :param stream: stream that keeps receiving the output
        """
        self.terminal = stream
        self.log = open(filename, 'w', encoding='utf-8')

    def write(self, message):
        """Write ``message`` to the wrapped stream and to the log file.

        :param message: text to write
        :return: number of characters written to the log file
        """
        self.terminal.write(message)
        return self.log.write(message)

    def flush(self):
        """Flush both the wrapped stream and the log file.

        A no-op here would leave log lines sitting in the file buffer when a
        caller (``print(..., flush=True)``, interpreter shutdown) asks for
        them to be written out.
        """
        self.terminal.flush()
        self.log.flush()

class DownloadFile(object):
    """Download one large file as parallel byte ranges, resuming partial chunks.

    Every worker thread stores its byte range in ``<file_path>_<thread_index>``.
    When all workers have finished, the chunk files are concatenated into
    ``file_path`` and removed.

    NOTE: chunk boundaries are derived from ``thread_num``, so an interrupted
    download must be resumed with the same ``thread_num``.
    """

    # Block size used when concatenating chunk files, in bytes.
    _COPY_BLOCK_SIZE = 1024 * 1024
    # Pause between two attempts of the same chunk, in seconds.
    _RETRY_DELAY = 1

    def __init__(self, download_url, data_folder, thread_num):
        """
        :param download_url: URL of the file to download
        :param data_folder: directory the file is stored in
        :param thread_num: number of download threads (one chunk per thread)
        """
        self.download_url = download_url
        self.data_folder = data_folder
        self.thread_num = thread_num
        self.file_size = None
        self.cut_size = None
        self.tqdm_obj = None
        self.thread_list = []
        # The local file is named after the last path segment of the URL.
        self.file_path = os.path.join(self.data_folder, download_url.split('/')[-1])

    def _sub_path(self, thread_index):
        """Return the path of the chunk file written by worker ``thread_index``."""
        return "{}_{}".format(self.file_path, thread_index)

    def downloader(self, etag, thread_index, start_index, stop_index, retry=False):
        """Download one byte range into its chunk file, resuming and retrying.

        Failed attempts are repeated (after a short pause) until the chunk has
        been streamed successfully. Each attempt re-reads the chunk size from
        disk after the previous file handle was closed, so the resume offset
        always matches what was really written.

        :param etag: ETag of the remote file ('' when the server sent none)
        :param thread_index: 1-based chunk number, used in the chunk file name
        :param start_index: offset of the first byte of the chunk
        :param stop_index: offset of the last byte of the chunk (inclusive),
            or '-' for "up to the end of the file"
        :param retry: True when the bytes already on disk have been added to
            the progress bar before and must not be counted again
        :raises RuntimeError: if the server answers with a status that cannot
            be used to continue this chunk (for example the file changed or
            byte ranges are not supported)
        """
        sub_path_file = self._sub_path(thread_index)
        if stop_index == '-':
            stop_index = ""

        while True:
            # Bytes of this chunk that are already on disk.
            if os.path.exists(sub_path_file):
                temp_size = os.path.getsize(sub_path_file)
            else:
                temp_size = 0
            if not retry:
                self.tqdm_obj.update(temp_size)
            # From now on the on-disk bytes are already part of the progress.
            retry = True

            offset = start_index + temp_size
            if stop_index != "":
                finished = offset > stop_index
            else:
                finished = self.file_size is not None and offset >= self.file_size
            if finished:
                # Requesting a range that starts behind its end would be
                # ignored or rejected by the server; just make sure the chunk
                # file exists for the merge step.
                open(sub_path_file, 'ab').close()
                return

            headers = {
                'Range': 'bytes={}-{}'.format(offset, stop_index),
                # Ask for the unencoded body so that byte offsets refer to
                # the bytes that end up on disk.
                'Accept-Encoding': 'identity',
            }
            # If-Range must carry a strong validator; weak ETags start with W/.
            if etag and not etag.startswith('W/'):
                headers['If-Range'] = etag

            fatal_status = None
            try:
                with httpx.stream("GET", self.download_url, headers=headers) as response:
                    status = response.status_code
                    if status >= 500:
                        # Handled like a dropped connection: retried below.
                        raise RuntimeError("server answered HTTP {}".format(status))
                    # A plain 200 carries the whole file, which is only what
                    # was asked for when the request was "everything from 0".
                    whole_file = status == 200 and offset == 0 and stop_index == ""
                    if status == 206 or whole_file:
                        with open(sub_path_file, 'ab') as down_file:
                            num_bytes_downloaded = response.num_bytes_downloaded
                            for chunk in response.iter_bytes():
                                if chunk:
                                    down_file.write(chunk)
                                    self.tqdm_obj.update(
                                        response.num_bytes_downloaded - num_bytes_downloaded)
                                    num_bytes_downloaded = response.num_bytes_downloaded
                    else:
                        fatal_status = status
            except Exception as e:
                print("Thread-{}:请求超时,尝试重连\n报错信息:{}".format(thread_index, e))
                time.sleep(self._RETRY_DELAY)
                continue

            if fatal_status is not None:
                # Writing such a body into the chunk would corrupt the file.
                raise RuntimeError(
                    "Thread-{}: unexpected HTTP status {} for range {}".format(
                        thread_index, fatal_status, headers['Range']))
            return

    def get_file_size(self):
        """Fetch the size and the ETag of the remote file.

        Only the response headers are read; the body is not downloaded.

        :return: tuple ``(total_size, etag)``; ``etag`` is '' when the server
            sent no ETag header
        """
        # Without content encoding, Content-Length is the size of the file itself.
        headers = {'Accept-Encoding': 'identity'}
        with httpx.stream("GET", self.download_url, headers=headers) as response2:
            response2.raise_for_status()
            total_size = int(response2.headers["Content-Length"])
            # Header lookup is case-insensitive, so "etag"/"Etag" match too.
            etag = response2.headers.get("ETag", "")
        return total_size, etag

    def cutting(self):
        """Split the file into ``thread_num`` consecutive byte ranges.

        Ranges are inclusive on both ends: the first one covers
        ``0..cut_size``, every following one starts one byte after the end of
        its predecessor, and the last one is open-ended ('-').

        :return: tuple ``(cut_info, cut_size)`` where ``cut_info`` maps the
            1-based chunk number to ``[cut_size, start, stop]``
        """
        cut_info = {}
        cut_size = self.file_size // self.thread_num
        for num in range(1, self.thread_num + 1):
            # Chunk 1 starts at byte 0; later chunks start right after the
            # inclusive end of the previous chunk.
            start = cut_size * (num - 1) + (0 if num == 1 else 1)
            stop = '-' if num == self.thread_num else cut_size * num
            cut_info[num] = [cut_size, start, stop]
        return cut_info, cut_size

    def write_file(self):
        """Concatenate the chunk files into the final file and delete them.

        Nothing is done when the final file already exists with at least
        ``file_size`` bytes. Chunk files are removed only after the merged
        file has been written completely.

        :raises RuntimeError: if the chunk files do not add up to ``file_size``
        """
        if self.file_size is not None and os.path.exists(self.file_path):
            if os.path.getsize(self.file_path) >= self.file_size:
                return

        sub_paths = [self._sub_path(index) for index in range(1, self.thread_num + 1)]
        if self.file_size is not None:
            downloaded = sum(os.path.getsize(path) for path in sub_paths)
            if downloaded != self.file_size:
                raise RuntimeError(
                    "chunk files hold {} bytes, expected {}".format(downloaded, self.file_size))

        # 'wb': leftovers of an earlier, interrupted merge are overwritten
        # instead of being extended.
        with open(self.file_path, 'wb') as f_count:
            for sub_path in sub_paths:
                with open(sub_path, 'rb') as sub_write:
                    # Copy block by block so a huge chunk never has to fit in memory.
                    while True:
                        block = sub_write.read(self._COPY_BLOCK_SIZE)
                        if not block:
                            break
                        f_count.write(block)

        for sub_path in sub_paths:
            os.remove(sub_path)
        return

    def create_thread(self, etag, cut_info):
        """Start one download thread per chunk and wait for all of them.

        :param etag: ETag passed on to every worker
        :param cut_info: chunk table as returned by :meth:`cutting`
        """
        for thread_index in range(1, self.thread_num + 1):
            _, start_index, stop_index = cut_info[thread_index]
            thread = Thread(target=self.downloader,
                            name='Thread-{}'.format(thread_index),
                            args=(etag, thread_index, start_index, stop_index),
                            daemon=True)
            thread.start()
            self.thread_list.append(thread)

        for thread in self.thread_list:
            thread.join()
        return

    def check_thread_status(self):
        """Report stopped worker threads once per second.

        Returns as soon as no thread in ``thread_list`` is alive any more.
        """
        while True:
            all_stopped = True
            for thread in self.thread_list:
                if thread.is_alive():
                    all_stopped = False
                else:
                    print("{}:已停止".format(thread.name))
            if all_stopped:
                return
            time.sleep(1)

    def create_data(self):
        """Create the storage directory (including parents) if it is missing."""
        os.makedirs(self.data_folder, exist_ok=True)
        return

    def main(self):
        """Run the whole download: probe, split, download in parallel, merge."""
        self.create_data()
        self.file_size, etag = self.get_file_size()

        # A complete file from an earlier run does not need to be fetched again.
        if os.path.exists(self.file_path) and os.path.getsize(self.file_path) >= self.file_size:
            return

        # More threads than bytes would produce empty, invalid ranges.
        self.thread_num = max(1, min(self.thread_num, self.file_size))

        # Split the file evenly between the threads.
        cut_info, self.cut_size = self.cutting()

        # Progress bar labelled with the file name.
        self.tqdm_obj = tqdm(total=self.file_size, unit_scale=True,
                             desc=os.path.basename(self.file_path),
                             unit_divisor=1024,
                             unit="B")
        try:
            # Download all chunks in parallel.
            self.create_thread(etag, cut_info)
        finally:
            self.tqdm_obj.close()

        # Merge the chunk files into the final file.
        self.write_file()
        return

if __name__ == '__main__':
    # Mirror everything printed to the console into log.txt
    # (comment the next line out if no log file is wanted).
    sys.stdout = Logger(r'log.txt', sys.stdout)
    # sys.stderr = Logger(r'log_file.txt', sys.stderr)

    timestamp_format = '%Y-%m-%d %H:%M:%S'
    divider = "==" * 20

    start_time = datetime.datetime.now().strftime(timestamp_format)
    print("开始时间:" + start_time)
    print(divider)

    download_url = "https://heyulei1.github.io/videos/1.mp4"
    # Downloads are stored in a "Data" directory next to this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    data_folder = os.path.join(script_dir, 'Data')
    # More threads can speed the download up, but do not go too high.
    thread_num = 20

    job = DownloadFile(download_url, data_folder, thread_num)
    job.main()
    print(download_url, '完成')

    end_time = datetime.datetime.now().strftime(timestamp_format)
    print(divider)
    print("结束时间:" + end_time + "\n")

```

Copyright © 2088 世界杯预选赛中国_1994年世界杯冠军是谁 - nywk120.com All Rights Reserved.
友情链接
Top