Django的bulk_create()

时间:2016-03-15 20:36:29

标签: python django

我已经完成了关于Django's bulk_create()的一些阅读,我正在尝试实现它,但它没有像我期望的那样运作。最初我的代码没有使用bulk_create,导入6074行数据大约需要33秒左右。慢但它有效。

型号:

class Building(models.Model):
    community = models.ForeignKey('Community', related_name='Building Community Name')
    physical_location = models.CharField(max_length=80, null=True, blank=True)
    data_source = models.CharField(max_length=50, null=True, blank=True)
    facility_name = models.CharField(max_length=120, null=True, blank=True)
    facility_type = models.CharField(max_length=80, null=True, blank=True)
    size = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    audited = models.NullBooleanField(blank=True)
    audit_notes = models.TextField(blank=True) 

class RetrofitData(models.Model):
    building_id = models.ForeignKey('Building')
    retrofits_done = models.NullBooleanField(blank=True)
    retrofit_notes = models.TextField(blank=True)
    fuel_oil_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    district_heating_oil_usage_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    electricity_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    natural_gas_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    propane_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    biomass_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    fuel_oil_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    district_heating_oil_usage_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    electricity_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    natural_gas_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    propane_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    biomass_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    retrofit_cost = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    biomass_heat = models.NullBooleanField(blank=True)
    heat_recovery = models.NullBooleanField(blank=True)

原始代码:

class BuildingInventoryImporter(dataimport.DataFileImporter):

   def toTrueFalse(self, val):
            if val == "Yes":
                return True
            elif val == "No":
                return False
            else:
                return None

    def decCleaner(self, val):
        if val == '':
            return None
        else:
            return val2dec(val)

    models = [Building, RetrofitData]

    @transaction.commit_manually
    @rollback_on_exception
    def do_import(self):
        book = xlrd.open_workbook(self.data_file.file.path, 
            encoding_override='cp1252')
        sheet = book.sheet_by_index(2)

        for row_index in range(1,sheet.nrows):
            row = sheet.row_values(row_index)

            temp_id= row_index
            community_name = row[0]
            gnis = row[1]
            physical_location = row[2]
            data_source = row[3]
            facility_type = row[5]

            if row[4] == '':
                if facility_type =="Other" or facility_type == "Office" or facility_type == "Public Assembly" or facility_type == "Public Safety":
                    facility_name = "Unavailable"
                elif facility_type =="Health Care - Hospitals":
                    facility_name = community_name + " Clinic"
                elif facility_type == "Education - K - 12":
                    facility_name = community_name + " School(s)"
            else:
                facility_name = row[4]    

            size = self.decCleaner(row[6])
            audited = self.toTrueFalse(row[7])
            audit_notes = row[8]

            building, created = self.get_or_new(Building, id=temp_id)

            try:
                community = Community.objects.get(gnis_feature_id=gnis)
            except Community.DoesNotExist:
                self.warning("The value entered for the Community GNIS: {0} does not exist.".format(row[1]))

                try:
                    community = Community.objects.get(name=community_name)
                    self.warning("The Community name: {0} is in the db but does not match its associated Community GNIS").format(row[0])
                except Community.DoesNotExist:
                    self.warning("Neither the Community name: {0} nor the Community GNIS: {1} exist.".format(row[0], row[1])) 
                    continue

            building.community = community
            building.physical_location = physical_location
            building.data_source = data_source
            building.facility_name = facility_name
            building.facility_type = facility_type
            building.size = size
            building.audited = audited
            building.audit_notes = audit_notes
            building.save()

            retrofit_data, created = self.get_or_new(RetrofitData, building_id=building)

            retrofit_data.retrofits_done =  self.toTrueFalse(row[9])
            retrofit_data.retrofit_notes = row[10]
            retrofit_data.fuel_oil_preretrofit = self.decCleaner(row[11])

            if row[12] == 999999999: #They decided that a unknown value would be represented as 999999999 in the dataset.
                retrofit_data.district_heating_oil_usage_preretrofit = None
            else:
                retrofit_data.district_heating_oil_usage_preretrofit = self.decCleaner(row[12])

            retrofit_data.electricity_preretrofit = self.decCleaner(row[13])
            retrofit_data.natural_gas_preretrofit = self.decCleaner(row[14])
            retrofit_data.propane_preretrofit = self.decCleaner(row[15])
            retrofit_data.biomass_preretrofit = self.decCleaner(row[16])
            retrofit_data.fuel_oil_postretrofit = self.decCleaner(row[17])
            retrofit_data.district_heating_oil_usage_postretrofit = self.decCleaner(row[18])
            retrofit_data.electricity_postretrofit = self.decCleaner(row[19])
            retrofit_data.natural_gas_postretrofit = self.decCleaner(row[20])
            retrofit_data.propane_postretrofit = self.decCleaner(row[21])
            retrofit_data.biomass_postretrofit = self.decCleaner(row[22])
            retrofit_data.retrofit_cost = self.decCleaner(row[23])
            retrofit_data.biomass_heat = self.toTrueFalse(row[24])
            retrofit_data.heat_recovery = self.toTrueFalse(row[25])
            retrofit_data.save()

        if self.dry_run:
            transaction.rollback()
        else:
            transaction.commit()
dataimport.register(BuildingInventoryImporter)

在整个数据导入过程中,它必须像〜1200次一样打到数据库,导致导入缓慢。所以为了解决这个问题,我研究了使用bulk_create()

修改后的代码:

class BuildingInventoryImporterV2(dataimport.DataFileImporter):

    models = [Building, RetrofitData]

    def do_import(self, dry_run=True):    
        book = xlrd.open_workbook(self.data_file.file.path, 
            encoding_override='cp1252')
            sheet = book.sheet_by_index(2)

        building_bulk_list = []
        retrofit_bulk_list = [] 

        for row_index in range(1,sheet.nrows):
            row = sheet.row_values(row_index)

            temp_id= row_index
            community_name = row[0]
            gnis = row[1]
            facility_type = row[5]

            try:
                community = Community.objects.get(gnis_feature_id=gnis)
            except Community.DoesNotExist:
                self.warning("The value entered for the Community GNIS: {0} does not exist.".format(row[1]))

                try:
                    community = Community.objects.get(name=community_name)
                    self.warning("The Community name: {0} is in the db but does not match its associated Community GNIS").format(row[0])
                except Community.DoesNotExist:
                    self.warning("Neither the Community name: {0} nor the Community GNIS: {1} exist.".format(row[0], row[1])) 
                    continue

            if row[4] == '':
                if facility_type =="Other" or facility_type == "Office" or facility_type == "Public Assembly" or facility_type == "Public Safety":
                    facility_name = "Unavailable"
                elif facility_type =="Health Care - Hospitals":
                    facility_name = community_name + " Clinic"
                elif facility_type == "Education - K - 12":
                    facility_name = community_name + " School(s)"
            else:
                facility_name = row[4]

            building_to_add = Building(    
                community=community,    
                physical_location=row[2],    
                data_source=row[3],    
                facility_name=facility_name,    
                facility_type=facility_type,    
                size=self.decCleaner(row[6]),    
                audited=self.toTrueFalse(row[7]),    
                audit_notes=row[8]    
            )
            building_bulk_list.append(building_to_add)
        if self.dry_run is False:
            Building.objects.bulk_create(building_bulk_list)

        for row_index in range(1,sheet.nrows):
            row = sheet.row_values(row_index)
            #They decided that a unknown value would be represented as 999999999 in the dataset.

            if row[12] == 999999999:    
                district_heating_oil_usage_preretrofit = None    
            else:    
                district_heating_oil_usage_preretrofit = self.decCleaner(row[12]) 

            retrofit_data_to_add = RetrofitData(    
                building_id=Building.objects.get(id=temp_id),    
                retrofits_done=self.toTrueFalse(row[9]),    
                retrofit_notes=row[10],    
                fuel_oil_preretrofit=self.decCleaner(row[11]),    
                district_heating_oil_usage_preretrofit=district_heating_oil_usage_preretrofit,    
                electricity_preretrofit=self.decCleaner(row[13]),    
                natural_gas_preretrofit=self.decCleaner(row[14]),    
                propane_preretrofit=self.decCleaner(row[15]),    
                biomass_preretrofit=self.decCleaner(row[16]),    
                fuel_oil_postretrofit=self.decCleaner(row[17]),    
                district_heating_oil_usage_postretrofit=self.decCleaner(row[18]),    
                electricity_postretrofit=self.decCleaner(row[19]),    
                natural_gas_postretrofit=self.decCleaner(row[20]),    
                propane_postretrofit=self.decCleaner(row[21]),    
                biomass_postretrofit=self.decCleaner(row[22]),    
                retrofit_cost=self.decCleaner(row[23]),    
                biomass_heat=self.toTrueFalse(row[24]),    
                heat_recovery=self.toTrueFalse(row[25])    
            )    
            retrofit_bulk_list.append(retrofit_data_to_add) 

        if self.dry_run is False:    
            Building.objects.bulk_create(retrofit_bulk_list)    
dataimport.register(BuildingInventoryImporterV2)

当我到第二个代码块批量导入RetroFitData时出现问题。据我所知,bulk_create()在调用时不会分配AutoField pk因此你需要在分配AutoField pk之前将bulk_create()数据放在数据库中。但似乎这也不准确。运行导入后,我收到以下错误:

Traceback:
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/core/handlers/base.py" in get_response
      111.                     response = wrapped_callback(request, *callback_args, **callback_kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/utils/decorators.py" in _wrapped_view
      105.                     response = view_func(request, *args, **kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/views/decorators/cache.py" in _wrapped_view_func
      52.         response = view_func(request, *args, **kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/contrib/admin/sites.py" in inner
      206.             return view(request, *args, **kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/contrib/auth/decorators.py" in _wrapped_view
      21.                 return view_func(request, *args, **kwargs)
    File "/home/bhernandez/ISER/aedg/core/adminviews.py" in data_import
      465.                 results = importer.run()
    File "/home/bhernandez/ISER/aedg/core/dataimport/__init__.py" in run
      114.         self.do_import()
    File "/home/bhernandez/ISER/aedg/akw/dataimport/etc.py" in do_import
      656.                 building_id=Building.objects.get(id=temp_id),    
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/db/models/manager.py" in manager_method
      92.                 return getattr(self.get_queryset(), name)(*args, **kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/db/models/query.py" in get
      357.                 self.model._meta.object_name)

    Exception Type: DoesNotExist at /admin/core/datafile/174/import/
    Exception Value: Building matching query does not exist.

但是,当我查看我的Buildings table it's been populated...时,我们非常感谢任何帮助或建议。

1 个答案:

答案 0 :(得分:0)

因此,当您开始创建RetrofitData时,您需要知道您刚刚创建的Building个对象的ID。

您可能正在使用ID字段设置为自动增量的数据库,因此您使用bulk_create创建的对象无法获得PK分配。

我想使用building_bulk_list您可以使用它的长度来从数据库中获取最后一组Building个对象,但是为什么不创建building_bulk_list更多的save()创建对象的传统方法,调用bulk_create,允许您创建ID列表?

然后使用该ID列表,您可以为RetrofitData运行Building,迭代该ID列表以设置与<Ip_Adress> <User_Id> <User_Name> 的关系?