我正在清理一个数据集:第一列是时间戳,后两列分别是价格和成交量。我想用几条规则剔除坏行,然后把所有错误的 tick 记录输出到一个文件,把所有正常的 tick 记录输出到另一个文件。除了去重之外,每条规则似乎都能正常工作。但我最终得到的行数比开始时少:
from datetime import datetime
def _parse_tick(raw_line):
    """Parse one ``timestamp,price,volume`` row into ``(datetime, float, int)``.

    Raises ValueError or IndexError when the row does not have that layout.
    """
    # splitting the line using the delimiter comma
    fields = raw_line.strip().split(",")
    return (
        datetime.strptime(fields[0], '%Y%m%d:%H:%M:%S.%f'),
        float(fields[1]),
        int(fields[2]),
    )


def _write_ticks(handle, ticks):
    """Write each tick to *handle* as a ``timestamp,price,volume`` line."""
    handle.writelines(",".join(map(str, tick)) + "\n" for tick in ticks)


def clean_ticks(source="hi.txt", good_path="true.txt", noise_path="noise.txt",
                batch_size=13):
    """Split the ticks in *source* into a good file and a noise file.

    A tick is good when ``0 < price < 3000`` and ``volume > 0``; every other
    tick is noise.  Good ticks are de-duplicated across the whole input and
    written sorted within each batch of *batch_size* input rows; noise ticks
    are written in input order.  Both output files are opened in append mode.

    Duplicates of a good tick are written to neither file, so the two outputs
    together can hold fewer rows than the input; the returned counts make that
    difference visible.

    Args:
        source: path of the comma-separated input file.
        good_path: path that receives the good ticks.
        noise_path: path that receives the rejected ticks.
        batch_size: number of input rows processed between two flushes.

    Returns:
        A dict with the keys ``"good"``, ``"noise"`` and ``"duplicates"``
        giving the number of rows that fell into each category.

    Raises:
        ValueError, IndexError: if a non-blank row cannot be parsed.
    """
    counts = {"good": 0, "noise": 0, "duplicates": 0}
    # Remembering every good tick seen so far is what removes duplicates that
    # span two batches; a per-batch set() only catches duplicates that happen
    # to share a batch.  The set grows with the number of unique good ticks.
    seen = set()
    good_batch = []
    noise_batch = []
    rows_read = 0

    with open(source, 'r') as src:
        with open(good_path, "a") as good_out:
            with open(noise_path, "a") as noise_out:
                for raw_line in src:
                    # A blank line (e.g. a trailing newline) is not a tick.
                    if not raw_line.strip():
                        continue
                    tick = _parse_tick(raw_line)
                    price, volume = tick[1], tick[2]
                    # EDA shows that the prices are between 0 and 3000 and
                    # volume must be greater than zero
                    if 0 < price < 3000 and volume > 0:
                        if tick in seen:
                            counts["duplicates"] += 1
                        else:
                            seen.add(tick)
                            good_batch.append(tick)
                            counts["good"] += 1
                    else:
                        noise_batch.append(tick)
                        counts["noise"] += 1
                    rows_read += 1
                    if rows_read % batch_size == 0:
                        _write_ticks(good_out, sorted(good_batch))
                        _write_ticks(noise_out, noise_batch)
                        good_batch = []
                        noise_batch = []
                # Flush the final partial batch; without this the last
                # ``rows_read % batch_size`` rows would never be written.
                _write_ticks(good_out, sorted(good_batch))
                _write_ticks(noise_out, noise_batch)
    return counts


if __name__ == "__main__":
    clean_ticks()