如何提高从ftp服务器下载数据的速度?

时间:2019-03-20 03:56:32

标签: python python-3.x download ftp ndfd

我正在编写一个Python脚本,用于从National Digital Forecast Database(NDFD)服务器下载数据。FTP服务器上的文件按 Year / YearMonth / YearMonthDay 的目录结构组织:我必须从每一天的文件夹中下载一个文件,然后切换到下一天的文件夹继续下载,依此类推。我当前的代码非常慢,下载一天的数据大约需要20秒,相当于下载一年的数据大约需要2小时。我希望它能更快。我的代码如下。

from ftplib import FTP  
import ftplib
import os
import datetime as dt
import pandas as pd 
import time

def _ftp_error_code(exc):
    """Return the first whitespace-separated token of an error message.

    Falls back to the exception class name when the message is empty, so
    that reporting an error can never itself raise ``IndexError``.
    """
    tokens = str(exc).split(None, 1)
    return tokens[0] if tokens else type(exc).__name__


def ndfd_download(keyword, days_, forecast_hour):
    """Download NDFD files close to a forecast hour for each requested day.

    For every day the remote directory ``/NDFD/<YYYYMM>/<YYYYMMDD>/`` is
    listed, files whose name starts with ``keyword`` and whose last four
    characters (read as an HHMM number) fall inside the forecast window
    are downloaded into ``<YYYY>/<YYYYMM>/<YYYYMMDD>/`` relative to the
    current working directory. The working directory is never changed.

    Args:
        keyword: Prefix that a remote file name must start with.
        days_: Iterable of objects exposing integer ``year``, ``month``
            and ``day`` attributes (e.g. ``datetime.datetime``).
        forecast_hour: Hour of the forecast; anything ``float()`` accepts
            (e.g. ``'14'``).

    Returns:
        None. Progress, errors and per-day timings are printed.

    Raises:
        ValueError: If ``forecast_hour`` cannot be converted to a number.
    """
    # search for the files between 30 minutes on either side
    # of the forecast hour; -40 converts 100 to 60 minutes
    time_start = int(float(forecast_hour)*100 - 30 - 40)
    time_end = int(float(forecast_hour)*100 + 30)

    # Materialise once so any iterable works and the empty case can be
    # detected without opening a network connection.
    days = list(days_)
    if not days:
        print('No days requested, nothing to download')
        return

    print('Starting connection to NOAA database')

    # Try connecting to the NCDC server; without a connection there is
    # nothing useful left to do, so report the error and stop.
    ftp = None
    try:
        ftp = FTP('nomads.ncdc.noaa.gov')
        ftp.login()
        print('Connect successful')
    except ftplib.all_errors as e:
        print(_ftp_error_code(e))
        if ftp is not None:
            ftp.close()
        return

    try:
        ftp.cwd('/NDFD/')
        print('Current working directory is %s' % ftp.pwd())

        # go through all the days
        for day_ in days:

            start = time.time()

            # get year, month, day information from day_
            year = "{:02d}".format(day_.year)
            year_month = year + "{:02d}".format(day_.month)
            year_month_day = year_month + "{:02d}".format(day_.day)

            try:
                # Change to the desired NDFD directory to get the data
                ftp.cwd('/NDFD/{}/{}/'.format(year_month, year_month_day))

                # names of all files in that directory with the keyword
                all_files = [name for name in ftp.nlst()
                             if name.startswith(keyword)]

                # local directory that receives the data for this day
                directoryName = os.path.join(year, year_month, year_month_day)
                os.makedirs(directoryName, exist_ok=True)

                print('Downloading data for {}'.format(year_month_day))

                for f in all_files:

                    # the last 4 characters of the file name contain
                    # the time of forecast; skip names that do not
                    try:
                        file_time = float(f[-4:])
                    except ValueError:
                        continue

                    # keep only forecasts inside our bounds
                    if not time_start <= file_time <= time_end:
                        continue

                    local_path = os.path.join(directoryName, f)
                    failed = False
                    # save the file with the same name
                    with open(local_path, 'wb') as out:
                        try:
                            ftp.retrbinary('RETR %s' % f, out.write)
                        except ftplib.all_errors as e:
                            print('Error', _ftp_error_code(e))
                            failed = True

                    # do not leave a truncated file behind
                    if failed and os.path.exists(local_path):
                        os.remove(local_path)

            except ftplib.error_perm as e:
                # e.g. the directory for this day does not exist
                print('Error', e)

            print(time.time() - start)
    finally:
        # release the control connection even if something unexpected
        # was raised while looping over the days
        ftp.close()

if __name__ == "__main__":
    # Script entry point: for each configured year, download the
    # forecasts for a run of consecutive days starting on 30 January.
    keyword = "YAUZ98"
    forecast_hour = '14'
    years = [2018]

    for year in years:
        month, day = 1, 30
        # NOTE: a fixed 100-day window is used here rather than the
        # full length of the calendar year.
        no_of_days = 100
        first_day = dt.datetime(year, month, day)
        days_ = [first_day + dt.timedelta(days=offset)
                 for offset in range(no_of_days)]

        ndfd_download(keyword, days_, forecast_hour)

0 个答案:

没有答案