针对大型数据集优化VLOOKUP

时间:2015-02-15 15:34:27

标签: excel vba

我编写了一个代码来比较两个工作表WS1和Ws2。代码从ws1中读取每一行的主键,并在ws2中找到具有相同主键的相应行,然后在两个工作表之间匹配所有其他列属性并进行相应的报告。

代码是:

     Sub DataComparator(ws1 As Worksheet, ws2 As Worksheet)
     Dim ws1row As Long, ws2row As Long, ws1col As Integer, ws2col As Integer
     Dim maxrow As Long, maxcol As Integer, colval1 As String, colval2 As String
     Dim difference As Long, reportrow As Long, reportcol As Long, flag As Boolean
     Dim row As Long, col As Long, pki As Long, pk As String, counter As Long
     Dim PctDone As Single, cell1 As String, cell2 As String, bfailed As Boolean

     TestDataComparator.FrameProgress.Visible = True
     TestDataComparator.LabelProgress.Visible = True

     'UserForm1.Visible = True
     'Application.ScreenUpdating = False
     DoEvents

         With ws1.UsedRange
            ws1row = .Rows.Count
            ws1col = .Columns.Count
         End With

         With ws2.UsedRange
            ws2row = .Rows.Count
            ws2col = .Columns.Count
        End With
        maxrow = ws1row
        maxcol = ws1col

       pk = UCase(TestDataComparator.TextBox1.Value)

       For col = 1 To maxcol
           If pk = UCase(ws1.Cells(1, col).Formula) Then
               pki = col
           End If
       Next col

       If maxrow < ws2row Then maxrow = ws2row
       If maxcol < ws2col Then maxcol = ws2col

       difference = 0
       reportrow = 0
       For row = 2 To maxrow
           keyval = ws1.Cells(row, 1).Formula
           flag = False
           bfailed = False
           'reportcol = 1

           For col = 2 To maxcol
               'If col = pki Then
               'Exit For
               'End If
               counter = counter + 1
               cell1 = ""
               cell2 = ""
               cell1 = ws1.Cells(row, col).Formula
           On Error Resume Next
           'Set Rng = Range("A2:" & Cells(ws2row, "A").Address)
           cell2 = Application.WorksheetFunction.VLookup(keyval,  ws2.UsedRange, col, False)
           If Err.Number <> 0 Then bfailed = True
           On Error GoTo 0
           If bfailed = True Then
               Exit For
           End If
           If cell1 <> cell2 Then
              flag = True
              'difference = difference + 1
              diffcolname = ws1.Cells(1, col)
              ws1.Cells(row, col).Interior.Color = RGB(255, 255, 0)
              ws1.Cells(1, col).Interior.Color = RGB(255, 0, 0)
              ws1.Cells(row, col).Font.Bold = True
              ws1.Cells(1, pki).Interior.Color = RGB(0, 255, 0)
              ws1.Cells(row, pki).Interior.Color = RGB(255, 255, 0)
              ws1.Cells(row, pki).Font.Color = RGB(255, 0, 0)
              ws1.Cells(row, pki).Font.Bold = True
       End If

    Next col
    If flag = True Then
          reportrow = reportrow + 1
    End If
    PctDone = counter / (maxrow * maxcol)
    TestDataComparator.FrameProgress.Caption = "Progress..." &  Format(PctDone, "0%")
    TestDataComparator.LabelProgress.Width = PctDone * (TestDataComparator.FrameProgress.Width - 10)
    DoEvents
  Next row

  TestDataComparator.Totalcount.Value = row - 2
  TestDataComparator.mismatchCount.Value = reportrow
  TestDataComparator.mismatchCount.Font = Bold

  difference = 0
  For col = 1 To maxcol
          If ws1.Cells(1, col).Interior.Color = RGB(255, 0, 0) Then
              difference = difference + 1
              TestDataComparator.AttributeNameList.AddItem (ws1.Cells(1, col))
          End If
  Next col

  TestDataComparator.FrameProgress.Visible = False
  TestDataComparator.LabelProgress.Visible = False
  'TestDataComparator.PleaseWait.Visible = False
   MsgBox difference & " columns contain different data! ", vbInformation, "Comparing Two Worksheets"

   Application.ScreenUpdating = True

  End Sub

我希望vlookup函数仅在WS2的整个列中搜索匹配,该列具有主键(index pki)而不是ws2.UsedRange。请提供建议。有没有比vlookup更好的选项? ws2.UsedRange的使用使得在大型数据集中搜索变得困难,这就是我想减少搜索空间的原因。我的数据集在excel中有超过40K行和155列。如果您认为不合适,还建议我计算进度条进度的公式。

来自OP评论的示例数据:

Name    Height  Weight
Jane    5'6''   78
Mike    5'4''   89
Monica  5'2''   56

2 个答案:

答案 0 :(得分:2)

我认为使用词典(在其他语言中也称为Hashtable)可以使它更快。您需要引用Microsoft Scripting Runtime库。

在开始逐行浏览ws1之前,您需要在一个循环中将ws2键值及其行号读入Dictionary中。然后在循环中,只需在字典中查找值,即可在ws2上获取行号。像这样:

Dim ws2keys As Dictionary
Set ws2keys = New Dictionary
' assuming you have a header row
For row = 2 To ws2.UsedRange.Rows.Count
    keyValue = ws1.Cells(row, 1).Value
    If keyValue <> "" Then ws2keys.Add(keyValue, row)
Next
' your dictionary is ready

然后在循环中,而不是在ws1上逐行使用VLookup:

ws2RowIndex = ws2keys.Item(ws1KeyValueYouAreLookingFor)

(代码可能不完美,我在这台机器上没有任何与微软相关的内容来检查语法,抱歉。)

答案 1 :(得分:0)

我已将每列的VLOOKUP缩减为单个MATCH以验证其是否存在,并将MATCH设置为匹配发生的WS2行。其他一切都是通过直接寻址完成​​的。

Sub DataComparator(ws1 As Worksheet, ws2 As Worksheet)
    Dim ws1row As Long, ws2row As Long, ws1col As Long, ws2col As Long
    Dim maxrow As Long, maxcol As Long, colval1 As String, colval2 As String
    Dim difference As Long, reportrow As Long, reportcol As Long, flag As Boolean
    Dim rw As Long, cl As Long, pki As Long, pk As String, counter As Long
    Dim cell1 As String, cell2 As String, bfailed As Boolean
    Dim iPCT As Long, ws2rw As Long, rWS1cr As Range, rWS2cr As Range, keyval As Variant, app As Application

    Set app = Application
    'UserForm1.Visible = True
    app.ScreenUpdating = False
    'DoEvents

    With ws1.Cells(1, 1).CurrentRegion
        Set rWS1cr = .Cells
        ws1row = .Rows.Count
        ws1col = .Columns.Count
    End With

    With ws2.Cells(1, 1).CurrentRegion
        Set rWS2cr = .Cells
        ws2row = .Rows.Count
        ws2col = .Columns.Count
    End With
    maxrow = ws1row
    maxcol = ws1col

    'pk = UCase(TestDataComparator.TextBox1.Value)
    For cl = 1 To maxcol
        If pk = UCase(rWS1cr.Cells(1, cl).Value) Then
            pki = cl
            Exit For
        End If
    Next cl

    If maxrow < ws2row Then maxrow = ws2row
    If maxcol < ws2col Then maxcol = ws2col

    difference = 0
    reportrow = 0
    With rWS1cr
        For rw = 2 To maxrow
            keyval = ws1.Cells(rw, 1).Value
            If Not IsError(app.Match(keyval, rWS2cr.Columns(1), 0)) Then
                ws2rw = app.Match(keyval, rWS2cr.Columns(1), 0)
                flag = False

                For cl = 2 To maxcol
                    counter = counter + 1
                    cell1 = vbNullString
                    cell2 = vbNullString
                    cell1 = .Cells(rw, cl).Value
                    cell2 = rWS2cr.Cells(ws2rw, cl).Value

                    If cell1 <> cell2 Then
                         flag = True
                         'diffcolname = .Cells(1, cl)
                         .Cells(rw, cl).Interior.Color = RGB(255, 255, 0)
                         .Cells(1, cl).Interior.Color = RGB(255, 0, 0)
                         .Cells(rw, cl).Font.Bold = True
                         .Cells(1, pki).Interior.Color = RGB(0, 255, 0)
                         .Cells(rw, pki).Interior.Color = RGB(255, 255, 0)
                         .Cells(rw, pki).Font.Color = RGB(255, 0, 0)
                         .Cells(rw, pki).Font.Bold = True
                    End If

                Next cl
                reportrow = reportrow - CLng(flag)
                If iPCT <> CLng((rw / maxrow) * 100) Then
                    iPCT = CLng((rw / maxrow) * 100)
                    app.StatusBar = "Progress - " & Format(iPCT, "0\%")
                End If
            End If
        Next rw
        For cl = 1 To maxcol
            If .Cells(1, cl).Interior.Color = RGB(255, 0, 0) Then
                difference = difference + 1
                'TestDataComparator.AttributeNameList.AddItem (ws1.Cells(1, col))
            End If
        Next cl
        MsgBox difference & " columns contain different data! ", vbInformation, "Comparing Two Worksheets"
    End With

    difference = 0


    app.ScreenUpdating = True
    app.StatusBar = vbNullString

    Set app = Nothing
End Sub

我更喜欢.CurrentRegion.UsedRange,因为我发现它更可靠。这段代码没有经过测试,但它确实编译了,我不得不注释掉一些外部引用来实现这一点。