Question

我是一个 Python 新手。在我的工作中，我需要处理大量的数据，所以我开始学习 Python 以提高效率。第一个小试验是：找到两个坐标之间的最近距离。我有两个文件，一个名为 "book.csv"，另一个名为 "macro.csv"。[文件内容屏幕截图][1]

book.csv 有三列：BookName、Longitude、Latitude；macro.csv 也有三列：MacroName、Longitude、Latitude。试验的目的是为每本书（book）找到距离最近的宏（macro）。我尝试使用 pandas 来完成这个试验，现在可以得到正确的结果，但效率有点低：当有 1500 本书和 200 个宏时，大约需要 15 秒。请问是否有办法提高效率？以下是我的试验代码：

#import pandas lib
from pandas import Series,DataFrame
import pandas as pd

#import geopy lib, to calculate the distance between two points
import geopy.distance

#def func, to calculate the distance, input parameter: two points coordinates(Lat,Lon),return m
def dist(coord1, coord2):
    """Return the distance in metres between two coordinates.

    Args:
        coord1: First point as a ``(latitude, longitude)`` tuple.
        coord2: Second point as a ``(latitude, longitude)`` tuple.

    Returns:
        The distance between the two points, in metres.
    """
    # ``vincenty`` was deprecated in geopy 1.13 and removed in geopy 2.0;
    # ``geodesic`` is its replacement. Fall back to ``vincenty`` only on
    # installations too old to provide ``geodesic``.
    distance_cls = getattr(geopy.distance, "geodesic", None)
    if distance_cls is None:
        distance_cls = geopy.distance.vincenty
    return distance_cls(coord1, coord2).m

#def func, to find the nearest result: including MacroName and distance
def find_nearest_macro(df_macro, df_book):
    """Find, for every book, the closest macro and the distance to it.

    Args:
        df_macro: DataFrame with the columns ``MacroName``, ``Latitude``
            and ``Longitude``.
        df_book: DataFrame with the columns ``Latitude`` and ``Longitude``
            (one row per book).

    Returns:
        A tuple ``(nearest_macro, nearest_dist)`` of two lists, each with one
        entry per row of ``df_book`` in row order: the name of the closest
        macro and the distance to it as returned by ``dist``. When several
        macros are equally close, the one appearing first in ``df_macro``
        is chosen.

    Raises:
        ValueError: If ``df_book`` has rows but ``df_macro`` has none.
    """
    # Pull the columns out as plain Python lists once. This reads the rows
    # positionally (so a non-default DataFrame index cannot break the lookup)
    # and avoids per-element Series indexing inside the nested loops.
    macro_names = df_macro["MacroName"].tolist()
    macro_coords = list(
        zip(df_macro["Latitude"].tolist(), df_macro["Longitude"].tolist())
    )
    book_coords = zip(df_book["Latitude"].tolist(), df_book["Longitude"].tolist())

    nearest_macro = []
    nearest_dist = []

    for book_coord in book_coords:
        # Distance from the *current* book to every macro. The previous
        # implementation seeded the search with the distance from book 0,
        # which produced wrong results for every other book.
        distances = [dist(book_coord, macro_coord) for macro_coord in macro_coords]
        # min() returns the first minimal index, i.e. ties go to the
        # earliest macro.
        best = min(range(len(distances)), key=distances.__getitem__)
        nearest_macro.append(macro_names[best])
        nearest_dist.append(distances[best])

    return (nearest_macro, nearest_dist)

# Locations of the two input tables
file_macro = '.\\TestFile\\Macro.csv'
file_book = '.\\TestFile\\Book.csv'

# Load both CSV files into dataframes
df_macro = pd.read_csv(file_macro)
df_book = pd.read_csv(file_book)

# For each book, look up the closest macro and the distance to it
t_nearest_result = find_nearest_macro(df_macro, df_book)
nearest_names, nearest_dists = t_nearest_result

# Attach the results to the book table as two new columns
df_book["NearestMacro"] = Series(nearest_names)
df_book["NearestDist"] = Series(nearest_dists)
print(df_book.head())

# Save the enriched book table to a new csv file
df_book.to_csv('.\\TestFile\\nearest.csv')

如何通过pandas提高while循环的效率

0 个答案: