Find matches in 2 cols using Collection vs Dictionary

  • Thread starter Thread starter GS
  • Start date Start date
G

GS

I've been working on this with Ron Rosefeld and Jim Cone to find an
optimum solution. I'm pleased to provide the following function for
review/testing/feedback.

The test data was 2 cols by 500,000 rows of random generated numbers
formatted as "0000000000000" so we'd have leading zeros.

The test machine is a 1.6Ghz dual core Dell Precision series laptop
running XP SP3 and Excel2007. Times are approximate, as per method
shown in function, and are as follows:

Allow duplicate values: 9secs
Allow unique values: 10secs

This is a considerable performance improvement over using Dictionary,
plus no ref to the Microsoft Scripting Runtime is needed.

I'd be pleased to here results from running this on other machines.
Here's the code I used to set up the data...


Sub Setup_Data_StripDupes()
With Range("A1:B500000")
.Formula = "=text(randbetween(1,10^6),""0000000000000"")"
.Value = .Value
End With
End Sub


Function StripDupes(Optional AllowDupes As Boolean = True) As Boolean
' Compares colA to colB and removes colA matches found in colB.
' Args In: AllowDupes: True by default. Keeps duplicate values
' found in colA that are not found in colB. If False,
' duplicate values in colA not found in colB are removed.
'
' Returns: True if matches found and no error occurs;
' False if matches not found --OR-- error occurs.
'
' Sources: Ron Rosenfeld, Jim Cone, Garry Sansom

Dim i&, j&, lMatchesFound& 'as long
Dim vRngA, vRngB, vRngOut() 'as variant

vRngA = Range("A1:A" & Cells(Rows.Count, "A").End(xlUp).Row)
vRngB = Range("B1:B" & Cells(Rows.Count, "B").End(xlUp).Row)

Debug.Print Now()
Dim dRngB As New Collection
On Error Resume Next
For j = LBound(vRngB) To UBound(vRngB)
dRngB.Add Key:=CStr(vRngB(j, 1)), Item:=CStr(vRngB(j, 1))
Next 'j
Err.Clear: On Error GoTo ErrExit

If AllowDupes Then '//fastest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
If dRngB.Item(CStr(vRngA(i, 1))) <> "" Then _
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1
skipit:
Next 'i

Else '//slowest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
dRngB.Add Key:=CStr(vRngA(i, 1)), Item:=CStr(vRngA(i, 1))
Next 'i
End If 'AllowDupes
Err.Clear: On Error GoTo ErrExit

j = 0: ReDim vRngOut(UBound(vRngA) - lMatchesFound, 0)
For i = LBound(vRngA) To UBound(vRngA)
If Not vRngA(i, 1) = "" Then _
vRngOut(j, 0) = vRngA(i, 1): j = j + 1
Next 'i

Range("A1:A" & lRows1).ClearContents
Range("A1").Resize(UBound(vRngOut), 1) = vRngOut
Debug.Print Now()

ErrExit:
StripDupes = (Err = 0)
Exit Function

MatchFound:
If AllowDupes Then Resume skipit
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1: Resume Next

End Function 'StripDupes()
 
Ron Rosenfeld, I apologize for mis-spelling your name. (I hate when the
keys I press don't press! Ugh!)
 
Ron,
The function is NOT designed to be used as a worksheet function, but
rather by VBA as follows...

If Not StripDupes Then MsgBox "Error!" Else RunSomeOtherCode
 
Thanks for catching the undeclared var. I forgot to modify the line
using it as intended. (Too old to work past being tired anymore!<g>)

I'm pleased that it performs nearly as well as yours did (assuming
tests were same). Here's a revised version prefaced by example usage:

Sub DoStuff()
If StripDupes then Call RunSomeProcess
End Sub 'DoStuff

Function StripDupes(Optional AllowDupes As Boolean = True) As Boolean
' Compares colA to colB and removes colA matches found in colB.
' Args In: AllowDupes: True by default. Keeps duplicate values
' found in colA that are not found in colB. If False,
' duplicate values in colA not found in colB are removed.
'
' Returns: True if matches found and no error occurs;
' False if matches not found --OR-- error occurs.
'
' Sources: Ron Rosenfeld, Jim Cone, Garry Sansom

Dim i&, j&, lMatchesFound& 'as long
Dim vRngA, vRngB, vRngOut() 'as variant

vRngA = Range("A1:A" & Cells(Rows.Count, "A").End(xlUp).Row)
vRngB = Range("B1:B" & Cells(Rows.Count, "B").End(xlUp).Row)

Debug.Print Now()
Dim dRngB As New Collection
On Error Resume Next
For j = LBound(vRngB) To UBound(vRngB)
dRngB.Add Key:=CStr(vRngB(j, 1)), Item:=CStr(vRngB(j, 1))
Next 'j
Err.Clear: On Error GoTo ErrExit

If AllowDupes Then '//fastest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
If dRngB.Item(CStr(vRngA(i, 1))) <> "" Then _
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1
skipit:
Next 'i

Else '//slowest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
dRngB.Add Key:=CStr(vRngA(i, 1)), Item:=CStr(vRngA(i, 1))
Next 'i
End If 'AllowDupes
Err.Clear: On Error GoTo ErrExit

j = 0: ReDim vRngOut(UBound(vRngA) - lMatchesFound, 0)
For i = LBound(vRngA) To UBound(vRngA)
If Not vRngA(i, 1) = "" Then _
vRngOut(j, 0) = vRngA(i, 1): j = j + 1
Next 'i

Range("A:A").ClearContents
Range("A1").Resize(UBound(vRngOut), 1) = vRngOut
Debug.Print Now()

ErrExit:
If lMatchesFound = 0 Then
StripDupes = False: MsgBox "No matches were found"
Else
StripDupes = (Err = 0)
End If 'lMatchesFound = 0
Exit Function

MatchFound:
If AllowDupes Then Resume skipit
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1: Resume Next

End Function 'StripDupes()
 
After a bit more thought I decided the StripDupes function should NOT
include any error notification from within, so this can be handled by
the caller...

Caller:
Sub Test_StripDupes()
Dim bSuccess As Boolean, lMatchesFound& 'as long

bSuccess = StripDupes(lMatchesFound) '//allow dupes in new list
' bSuccess = StripDupes(lMatchesFound, False) '//no dupes in new list

Select Case bSuccess
Case Is = False
If lMatchesFound = 0 Then MsgBox "No matches found!" _
Else MsgBox "An error occured!"

Case Is = True
If lMatchesFound = 0 Then
MsgBox "No matches found!"
Else
MsgBox Format(CStr(lMatchesFound), "#,##0") _
& " Matches were found"
'Code goes here to call some other process to act on new list
End If 'lMatchesFound = 0
End Select 'Case bSuccess
End Sub

Results:
True call on new data: "196,484 matches found"
Repeat True call on above call's list: "No matches were found"
False call on above call's list: "64,495 matches were found"
Repeat False call on above call's list: "No matches were found"
Repeat True call on above call's list: "No matches were found"

**Note that the matches found on the False call are additional after
running the True call first. Otherwise, running the False call first
would have returned the sum of both matches found**


Revised function:
Function StripDupes(Matches As Long, _
Optional AllowDupes As Boolean = True) As Boolean
' Compares colA to colB and removes colA matches found in colB.
'
' Args In: Matches: ByRef var to return number of matches found to
' the caller.
'
' AllowDupes: True by default. Keeps duplicate values
' found in colA that are not found in colB. If False,
' duplicate values in colA not found in colB are removed.
'
' Returns: True if matches found and no error occurs;
' False if matches not found --OR-- error occurs.
'
' Sources: Ron Rosenfeld, Jim Cone, Garry Sansom

Dim i&, j&, lMatchesFound& 'as long
Dim vRngA, vRngB, vRngOut() 'as variant

vRngA = Range("A1:A" & Cells(Rows.Count, "A").End(xlUp).Row)
vRngB = Range("B1:B" & Cells(Rows.Count, "B").End(xlUp).Row)

Debug.Print Now()
Dim dRngB As New Collection
On Error Resume Next
For j = LBound(vRngB) To UBound(vRngB)
dRngB.Add Key:=CStr(vRngB(j, 1)), Item:=CStr(vRngB(j, 1))
Next 'j
Err.Clear: On Error GoTo ErrExit

If AllowDupes Then '//fastest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
If dRngB.Item(CStr(vRngA(i, 1))) <> "" Then _
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1
skipit:
Next 'i

Else '//slowest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
dRngB.Add Key:=CStr(vRngA(i, 1)), Item:=CStr(vRngA(i, 1))
Next 'i
End If 'AllowDupes
Err.Clear: On Error GoTo ErrExit

j = 0: ReDim vRngOut(UBound(vRngA) - lMatchesFound, 0)
For i = LBound(vRngA) To UBound(vRngA)
If Not vRngA(i, 1) = "" Then _
vRngOut(j, 0) = vRngA(i, 1): j = j + 1
Next 'i

Range("A:A").ClearContents
Range("A1").Resize(UBound(vRngOut), 1) = vRngOut
Debug.Print Now()

ErrExit:
Matches = lMatchesFound: StripDupes = (Err = 0)
Exit Function

MatchFound:
If AllowDupes Then Resume skipit
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1: Resume Next

End Function 'StripDupes()
 
I've condensed the test caller procedure as follows...

Sub Test_StripDupes()
Dim bSuccess As Boolean, lMatchesFound& 'as long

bSuccess = StripDupes(lMatchesFound) '//allow dupes in new list
' bSuccess = StripDupes(lMatchesFound, False) '//no dupes in new list

If lMatchesFound = 0 Then MsgBox "No matches found!": Exit Sub
If Not bSuccess Then
MsgBox "An error occured!"
Else
MsgBox Format(CStr(lMatchesFound), "#,##0") _
& " Matches were found"
'Code goes here to call some other process to act on new list
End If 'Not bSuccess
End Sub
 
My understanding is that doesn't matter. Your Function is still trying to
modify worksheet cells, and that is probably where the error is coming from.
Do you actually see the worksheet cells being written to when you run that
function?

Yes, the results edit the original list (colA).
Is this issue about my function modifying cells an unspoken or
undocumented rule? How are you using the function?
 
Ron Rosenfeld used his keyboard to write :
My understanding is that doesn't matter. Your Function is still trying to
modify worksheet cells, and that is probably where the error is coming from.
Do you actually see the worksheet cells being written to when you run that
function?

My understanding of a function is that it's used when a return is
needed. There is no difference, otherwise, between a function and a
sub.

My understanding of a UDF is that it can't modify cells if called from
a worksheet cell formula. This definitely DOES NOT apply to VBA
functions called by VBA procedures (function or sub).
 
Please change to the 'fixed' version I posted (time stamped 10:06:54
AM)
which was my rep;y to your 2nd listed post time stamped 7:53:31 AM.
 
GS brought next idea :
Please change to the 'fixed' version I posted (time stamped 10:06:54 AM)
which was my rep;y to your 2nd listed post time stamped 7:53:31 AM.

That would be my 2nd reply to your 2nd listed post.
 
In an effort to make this function more 'functional', I've modified it
so the user can specify the col to remove dupes from along with the col
to check AND the col where to put the revised list. This should qualify
this as a reusable utility users can run from PERSONAL.XLS or a
utilities addin if they have one.

The caller routine:
Sub CompareCols_StripDupes()
Dim bSuccess As Boolean, lMatchesFound& 'as long
Dim vAns As Variant, sMsg As String

sMsg = _
"Do you want to remove any duplicate items in the non-matches?" _
& vbLf & "(Doing so will return a list of unique items)"
vAns = MsgBox(sMsg, vbYesNo + vbQuestion)

If vAns = vbNo Then
bSuccess = StripDupes(lMatchesFound) '//dupes allowed
Else
bSuccess = StripDupes(lMatchesFound, False) '//no dupes allowed
End If 'vAns = vbNo

If lMatchesFound = 0 Then MsgBox "No matches found!": Exit Sub
If lMatchesFound < 0 Then
sMsg = "Both columns must have more than 1 item!" _
& vbLf & vbLf & "Please try again: specify different columns!"
MsgBox sMsg, vbExclamation
Exit Sub
End If 'lMatchesFound < 0

If bSuccess Then
sMsg = Format(CStr(lMatchesFound), "#,##0") _
& " Matches were found"
If vAns = vbYes Then _
sMsg = sMsg & " (including non-match duplicates)"
MsgBox sMsg
'Code goes here to call some other process to act on new list
Else
MsgBox "An error occured!"
End If 'bSuccess
End Sub


The new StripDupes() function:
Function StripDupes(Matches As Long, _
Optional AllowDupes As Boolean = True) As Boolean
' Compares 2 user-specified cols and removes matches found.
' User can also specific target col to receive revised list.
'
' Args In: Matches: ByRef var to return number of matches found to
' the caller.
'
' AllowDupes: True by default. Keeps duplicate non-match
' values in col to remove dupes from. If passing False,
' duplicate items in non-match col are removed.
'
' Returns: True if matches found and no error occurs;
' False if a: matches not found --OR-- error occurs;
' b: either input col has less than 2 items.
'
' Sources: Ron Rosenfeld, Jim Cone, Garry Sansom
''''''''''

Dim i&, j&, lMatchesFound& 'as long
Dim vRngA, vRngB, vRngOut(), vAns 'as variant
Dim sRngOut As String

'Get the label of the columns to act on
Const sMsg As String = "Please enter the label of the column"
'Column to filter
vAns = Application.InputBox(sMsg _
& " to remove duplicates from", Type:=2)
If Not vAns Or vAns = "" Then Exit Function
vRngA = Range(vAns & "1:" & vAns _
& Cells(Rows.Count, vAns).End(xlUp).Row)
sRngOut = vAns '//output goes here unless specified below
'Column to be checked
vAns = Application.InputBox(sMsg _
& " to check for duplicates", Type:=2)
If Not vAns Or vAns = "" Then Exit Function
vRngB = Range(vAns & "1:" & vAns _
& Cells(Rows.Count, vAns).End(xlUp).Row)

'Make sure lists contain more than 1 item
If Not IsArray(vRngA) Or Not IsArray(vRngB) Then _
Matches = -1: Exit Function

'Column to receive the results
vAns = Application.InputBox(sMsg _
& "where the new list is to go" & vbLf _
& "(Leave blank or click 'Cancel' to use column " _
& UCase$(sRngOut) & ")", Type:=2)
If Not vAns Or vAns = "" Then sRngOut = sRngOut _
Else sRngOut = vAns

'Debug.Print Now()
Dim dRngB As New Collection
On Error Resume Next
For j = LBound(vRngB) To UBound(vRngB)
dRngB.Add Key:=CStr(vRngB(j, 1)), Item:=CStr(vRngB(j, 1))
Next 'j
Err.Clear: On Error GoTo ErrExit

If AllowDupes Then '//fastest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
If dRngB.Item(CStr(vRngA(i, 1))) <> "" Then _
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1
skipit:
Next 'i

Else '//slowest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
dRngB.Add Key:=CStr(vRngA(i, 1)), Item:=CStr(vRngA(i, 1))
Next 'i
End If 'AllowDupes
Err.Clear: On Error GoTo ErrExit

j = 0: ReDim vRngOut(UBound(vRngA) - lMatchesFound, 0)
For i = LBound(vRngA) To UBound(vRngA)
If Not vRngA(i, 1) = "" Then _
vRngOut(j, 0) = vRngA(i, 1): j = j + 1
Next 'i

Range(sRngOut & ":" & sRngOut).ClearContents
With Range(sRngOut & "1").Resize(UBound(vRngOut), 1)
.NumberFormat = "0000000000000": .Value = vRngOut:
..EntireColumn.AutoFit
End With
'Debug.Print Now()

ErrExit:
Matches = lMatchesFound: StripDupes = (Err = 0)
Exit Function

MatchFound:
If AllowDupes Then Resume skipit
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1: Resume Next

End Function 'StripDupes()
 
Just a text wrap fix near end of the function where it writes to the
output column...

Function StripDupes(Matches As Long, _
Optional AllowDupes As Boolean = True) As Boolean
' Compares 2 user-specified cols and removes matches found.
' User can also specific target col to receive revised list.
'
' Args In: Matches: ByRef var to return number of matches found to
' the caller.
'
' AllowDupes: True by default. Keeps duplicate non-match
' values in col to remove dupes from. If passing False,
' duplicate items in non-match col are removed.
'
' Returns: True if matches found and no error occurs;
' False if a: matches not found --OR-- error occurs;
' b: either input col has less than 2 items.
'
' Sources: Ron Rosenfeld, Jim Cone, Garry Sansom

Dim i&, j&, lMatchesFound& 'as long
Dim vRngA, vRngB, vRngOut(), vAns 'as variant
Dim sRngOut As String

'Get the label of the columns to act on
Const sMsg As String = "Please enter the label of the column"
'Column to filter
vAns = Application.InputBox(sMsg _
& " to remove duplicates from", Type:=2)
If Not vAns Or vAns = "" Then Exit Function
vRngA = Range(vAns & "1:" & vAns _
& Cells(Rows.Count, vAns).End(xlUp).Row)
sRngOut = vAns '//output goes here unless specified below
'Column to be checked
vAns = Application.InputBox(sMsg _
& " to check for duplicates", Type:=2)
If Not vAns Or vAns = "" Then Exit Function
vRngB = Range(vAns & "1:" & vAns _
& Cells(Rows.Count, vAns).End(xlUp).Row)

'Make sure lists contain more than 1 item
If Not IsArray(vRngA) Or Not IsArray(vRngB) Then _
Matches = -1: Exit Function

'Column to receive the results
vAns = Application.InputBox(sMsg _
& "where the new list is to go" & vbLf _
& "(Leave blank or click 'Cancel' to use column " _
& UCase$(sRngOut) & ")", Type:=2)
If Not vAns Or vAns = "" Then sRngOut = sRngOut _
Else sRngOut = vAns

'Debug.Print Now()
Dim dRngB As New Collection
On Error Resume Next
For j = LBound(vRngB) To UBound(vRngB)
dRngB.Add Key:=CStr(vRngB(j, 1)), Item:=CStr(vRngB(j, 1))
Next 'j
Err.Clear: On Error GoTo ErrExit

If AllowDupes Then '//fastest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
If dRngB.Item(CStr(vRngA(i, 1))) <> "" Then _
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1
skipit:
Next 'i

Else '//slowest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
dRngB.Add Key:=CStr(vRngA(i, 1)), Item:=CStr(vRngA(i, 1))
Next 'i
End If 'AllowDupes
Err.Clear: On Error GoTo ErrExit

j = 0: ReDim vRngOut(UBound(vRngA) - lMatchesFound, 0)
For i = LBound(vRngA) To UBound(vRngA)
If Not vRngA(i, 1) = "" Then _
vRngOut(j, 0) = vRngA(i, 1): j = j + 1
Next 'i

Range(sRngOut & ":" & sRngOut).ClearContents
With Range(sRngOut & "1").Resize(UBound(vRngOut), 1)
.NumberFormat = "0000000000000"
.Value = vRngOut
.EntireColumn.AutoFit
End With
'Debug.Print Now()

ErrExit:
Matches = lMatchesFound: StripDupes = (Err = 0)
Exit Function

MatchFound:
If AllowDupes Then Resume skipit
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1: Resume Next

End Function 'StripDupes()
 
Garry and Ron,
I was going to put my 2cents worth in here when appropriate, but discovered when testing the Dupes
code yesterday that MS created a problem in XL2010 with SpecialCells.
It causes a large delay in processing (sometimes minutes) while SpecialCells attempts to return an
answer. I discovered on the "Excel for Developers" website that is an admitted issue without a fix.
While trying to come up with a workaround for Special Cells, I discovered other unrelated code
problems.

Anyway, still working on the above and am undecided whether I should spend my time on something
worthwhile (maybe bowling).<g>
While I taking a breather, thought I would pass along a couple of items to consider when using
collections...
---
Dim dRngB As New Collection

is not as efficient as...

Dim dRngB As Collection
Set dRngB = New Collection

Apparently, there are some repetitive internal checks the first construct causes.
---
Also, you should find code is faster when adding to the collection, if you fill the Item with
vbNullstring.
The collection is just being used to dump duplicates and you can iterate the Keys just as easy as
the Items.
'---
Jim Cone






"GS" <[email protected]>
wrote in message
 
After serious thinking Jim Cone wrote :
Garry and Ron,
I was going to put my 2cents worth in here when appropriate, but discovered
when testing the Dupes code yesterday that MS created a problem in XL2010
with SpecialCells.
It causes a large delay in processing (sometimes minutes) while SpecialCells
attempts to return an answer. I discovered on the "Excel for Developers"
website that is an admitted issue without a fix.
While trying to come up with a workaround for Special Cells, I discovered
other unrelated code problems.

Not sure why this would be an issue since my code doesn't use
SpecialCells. What is the offending code?
Anyway, still working on the above and am undecided whether I should spend my
time on something worthwhile (maybe bowling).<g>
While I taking a breather, thought I would pass along a couple of items to
consider when using collections...
---
Dim dRngB As New Collection

is not as efficient as...

Dim dRngB As Collection
Set dRngB = New Collection

Apparently, there are some repetitive internal checks the first construct
causes.

I'm not aware of this but will look into it. I'm just doing what I've
seen done in VB6. Collection is a built-in object class and so we
should be able to do it either way because all we're doing is creating
an instance of an existing object <AFAIK>.

I can see where this might be true for an external object like the
Scripting.Dictionary because VBA needs to verify a ref to that object.
I could be totally wrong but don't think this happens when we
instantiate intrinsic objects (or custom objects defined in a cls).
---
Also, you should find code is faster when adding to the collection, if you
fill the Item with vbNullstring.
The collection is just being used to dump duplicates and you can iterate the
Keys just as easy as the Items.

I assume you're saying to fill the Key with vbNullString since we need
the Item for the test? Or, are you suggesting we fill Item with a
vbNullString and use Key for the test? I'm not sure why we should
change it since both need to be populated.
 
Jim Cone explained on 1/18/2012 :
Chip Pearson has some comments on "Don't Use Auto-Instancing Object
Variables" at
http://www.cpearson.com/excel/DeclaringVariables.aspx

A Collection object uses the Key to make its decisions. The Item is just
along for the ride.
You can stick almost anything into the Item.

Thanks, Jim.
I looked at Chip's article and I agree with it without reservation. I
reiterate, though, my comments regarding non-intrinsic objects since
his comments ref the external Scripting lib. Nonetheless, since we do
not work with the Collection object in the ways he points out, my
position still stands for using auto-instancing because the object
doesn't get instantiated until needed in the function and doesn't
persist to exist outside the function. I guess it's a matter of knowing
when it's okay to use auto-instancing and when not to use it. I suppose
it's also good practice to be consistent in how one handles this,
though in my world there's room for variable consistency<g> when
warranted. In the context of this function the Collection is also
auto-destroyed and so no need to add extra Set statements<IMO>. I'm a
strong supporter of explicitly destroying any objects we explicitly
create, and so...

Dim cRngB As Collection
Set cRngB = New Collection

...should be explicitly destroyed as a point of 'good practice' when
we're done with it...

Set cRngB = Nothing

...so VBA doesn't have to do the extra processing involved with implicit
destruction of the object variable. Not a big deal on a one-by-one
basis but the performance effect can be accumulative over the life of a
project's runtime.

So the variable consistency I use is...

Create explicitly; destroy explicitly
Auto-instance; auto-destroy
--


I'm okay with the way the Key/Item is handled since both are required
inputs and so must be some value when added. I don't see any advantage
in swapping the same value as Key with vbNullString for Item. I could,
however, swap using Item for the check to using Key since Key should be
unique while Item can be anything. I realize we use the same value for
both but the logical test in the real world will 'usually' be made on
Key if known, Item otherwise. Ron's implementation is good either way
IMO.
 
Jim Cone explained :
Also, you should find code is faster when adding to the collection, if you
fill the Item with vbNullstring.
The collection is just being used to dump duplicates and you can iterate the
Keys just as easy as the Items.

After giving this more thought I decided to try using vbNullString just
to eliminate the 2 CStr() functions. It improved performance by 1 sec,
which is 12.5% based on the time using the 2 CStr() functions.

As stated previously, at first I didn't think it would be an advantage
but forgot I had to use CStr() because I deliberately left the cells
numeric so they'd be usable in formulas/calcs. (There was no need for
CStr() using Dictionary) Good catch, Jim! ..thanks for pointing this
out!

<FWIW>
I tested Chip's theory about being able to test an auto-instance object
and found a discrepancy in the results. Unless I misread his comments,
we can work with an auto-instance object same as an explicitly created
object using the Set statement.
 
Thanks Jim & Ron for your feedback and helpful input. I modified this
project as follows...

The caller sub:
--I added an optional notification that asks the user if they want to
run a process on a returned list when the number of matches is
reported. This can be swapped out for the former notification via
'Commenting'. (Code for the process to call needs to be added as
appropriate to user's needs)

The StripDupes() function:
--In the case where no matches are found, the function makes no changes
to the worksheet.
--Iteration of the Collection acts on 'Key'.
--The 2 CStr() functions for adding values to 'Item' were replaced with
'vbNullString'


Final drafts...

Sub CompareCols_StripDupes()
Dim bSuccess As Boolean, lMatchesFound As Long
Dim vAns As Variant, sMsg As String

sMsg = _
"Do you want to remove any duplicate items in the non-matches?" _
& vbLf & vbLf & "(Doing so will return a list of unique items)"
vAns = MsgBox(sMsg, vbYesNo + vbQuestion)

If vAns = vbNo Then
bSuccess = StripDupes(lMatchesFound) '//dupes allowed
Else
bSuccess = StripDupes(lMatchesFound, False) '//no dupes allowed
End If 'vAns = vbNo

If lMatchesFound = 0 Then MsgBox "No matches found!": Exit Sub
If lMatchesFound < 0 Then
sMsg = "Both columns must have more than 1 item!" _
& vbLf & vbLf _
& "Please try again: specify different columns!"
MsgBox sMsg, vbExclamation
Exit Sub
End If 'lMatchesFound < 0

If bSuccess Then
sMsg = Format(CStr(lMatchesFound), "#,##0") _
& " Matches were found"
If vAns = vbYes Then _
sMsg = sMsg & " (including non-match duplicates)"
MsgBox sMsg '//comment out if using option below

'Optional: Ask to run a process on the new list
' sMsg = sMsg & vbLf & vbLf _
' & "Do you want to process the new list?"
'
' vAns = MsgBox(sMsg, vbYesNo + vbQuestion)
' If vAns = vbYes Then
' 'Code... ('Call' a process to act on the new list)
' End If 'vAns = vbYes
Else
MsgBox "An error occured!"
End If 'bSuccess
End Sub

Function StripDupes(Matches As Long, _
Optional AllowDupes As Boolean = True) As Boolean
' Compares 2 user-specified cols and removes matches found.
' User can also specific target col to receive revised list.
'
' Args In: Matches: ByRef var to return number of matches found to
' the caller.
'
' AllowDupes: True by default. Keeps duplicate non-match
' values in col to remove dupes from. If passing False,
' duplicate items in non-match col are removed.
'
' Returns: True if matches found and no error occurs;
' False if a: matches not found --OR-- error occurs;
' b: either input col has less than 2 items.
'
' Sources: Ron Rosenfeld, Jim Cone, GS (Garry Sansom)

Dim i&, j&, lMatchesFound& 'as long
Dim vRngA, vRngB, vRngOut(), vAns 'as variant
Dim sRngOut As String

'Get the label of the columns to act on
Const sMsg As String = "Please enter the label of the column"
'Column to filter
vAns = Application.InputBox(sMsg _
& " to remove duplicates from", Type:=2)
If vAns = False Or vAns = "" Then Exit Function
vRngA = Range(vAns & "1:" & vAns _
& Cells(Rows.Count, vAns).End(xlUp).Row)
sRngOut = vAns '//output goes here unless specified below
'Column to be checked
vAns = Application.InputBox(sMsg _
& " to check for duplicates", Type:=2)
If vAns = False Or vAns = "" Then Exit Function
vRngB = Range(vAns & "1:" & vAns _
& Cells(Rows.Count, vAns).End(xlUp).Row)

'Make sure lists contain more than 1 item
If Not IsArray(vRngA) Or Not IsArray(vRngB) Then _
Matches = -1: Exit Function

'Column to receive the results
vAns = Application.InputBox(sMsg _
& "where the new list is to go" & vbLf _
& "(Leave blank or click 'Cancel' to use column " _
& UCase$(sRngOut) & ")", Type:=2)
If vAns = False Or vAns = "" Then sRngOut = sRngOut _
Else sRngOut = vAns

Debug.Print Now()
Dim cRngB As New Collection
On Error Resume Next
For j = LBound(vRngB) To UBound(vRngB)
cRngB.Add Key:=CStr(vRngB(j, 1)), Item:=vbNullString
Next 'j
Err.Clear: On Error GoTo ErrExit

If AllowDupes Then '//fastest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
If cRngB.Key(CStr(vRngA(i, 1))) <> "" Then _
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1
skipit:
Next 'i

Else '//slowest
On Error GoTo MatchFound
For i = LBound(vRngA) To UBound(vRngA)
cRngB.Add Key:=CStr(vRngA(i, 1)), Item:=vbNullString
Next 'i
End If 'AllowDupes
Err.Clear: On Error GoTo ErrExit

j = 0: ReDim vRngOut(UBound(vRngA) - lMatchesFound, 0)
For i = LBound(vRngA) To UBound(vRngA)
If Not vRngA(i, 1) = "" Then _
vRngOut(j, 0) = vRngA(i, 1): j = j + 1
Next 'i

If lMatchesFound > 0 Then '//only write if lMatchesFound > 0
Range(sRngOut & ":" & sRngOut).ClearContents
With Range(sRngOut & "1").Resize(UBound(vRngOut), 1)
.Value = vRngOut
.NumberFormat = "0000000000000" '//optional
.EntireColumn.AutoFit '//optional
End With
End If 'lMatchesFound > 0

Debug.Print Now()
ErrExit:
Matches = lMatchesFound: StripDupes = (Err = 0)
Exit Function

MatchFound:
If AllowDupes Then Resume skipit
vRngA(i, 1) = "": lMatchesFound = lMatchesFound + 1: Resume Next

End Function 'StripDupes()

Enjoy!
 
I've reworked this utility as follows:

A bug in the AllowDupes feature is fixed
Prompts have been moved to the caller
Parameters are passed to the function via an array
Function supports:
- removing matches or non-matches
- returning a list with or without duplicate values


<code> - watch for line wrapping

The new caller:
Sub CompareCols_FilterMatches()
Dim bSuccess As Boolean, lMatchesFound As Long
Dim vAns As Variant, vCriteria(5) As Variant, sMsg As String

'Get the label of the columns to act on
Const MSG As String = "Please enter the label of the column"

tryagain:
'Column to filter
sMsg = MSG & " to be filtered": vAns = Application.InputBox(sMsg,
Type:=2)
If vAns = False Or vAns = "" Then Beep: Exit Sub
vCriteria(0) = Range(vAns & "1:" & vAns & Cells(Rows.Count,
vAns).End(xlUp).Row).Address
'Output goes in the column being filtered unless specified otherwise
below
vCriteria(2) = UCase$(vAns)

'Column to be checked
sMsg = MSG & " to check for matches": vAns =
Application.InputBox(sMsg, Type:=2)
If vAns = False Or vAns = "" Then Beep: Exit Sub
vCriteria(1) = Range(vAns & "1:" & vAns & Cells(Rows.Count,
vAns).End(xlUp).Row).Address

'Make sure lists contain more than 1 item
If Not Range(vCriteria(0)).Cells.Count > 1 _
Or Not Range(vCriteria(1)).Cells.Count > 1 Then
sMsg = "Columns MUST have more than one value!" & vbLf & vbLf
sMsg = sMsg & "Please try again with a different set of
columns"
MsgBox sMsg, vbCritical: GoTo tryagain
End If

'Column to receive the results
sMsg = MSG & "where the new list is to go" & vbLf _
& "(Leave blank or click 'Cancel' to use column '" &
vCriteria(2) & "')"
vAns = Application.InputBox(sMsg, Type:=2)
If Not (vAns = False) And (vAns <> "") Then vCriteria(2) =
UCase$(vAns)

'Return or remove matches?
sMsg = "Do you want to return the matches found instead of removing
them?"
vAns = MsgBox(sMsg, vbYesNo + vbQuestion)
If (vAns = vbYes) Then vCriteria(3) = 1 Else vCriteria(3) = 0

'Return a unique list?
sMsg = "Do you want only unique items in the returned list?" & vbLf &
vbLf & "(No duplicates)"
vAns = MsgBox(sMsg, vbYesNo + vbQuestion) '//YES = no dupes allowed
If (vAns = vbYes) Then vCriteria(4) = 0 Else vCriteria(4) = 1
bSuccess = FilterMatches(lMatchesFound, vCriteria())

If lMatchesFound = 0 Then MsgBox "No matches found!": Exit Sub
If lMatchesFound < 0 Then
sMsg = "Both columns must have more than 1 item!"
sMsg = sMsg & vbLf & vbLf & "Please try again: specify different
columns!"
MsgBox sMsg, vbExclamation: Exit Sub
End If 'lMatchesFound < 0

If bSuccess Then
sMsg = Format(CStr(lMatchesFound), "#,##0") _
& " Matches were found"
If vAns = vbYes Then _
sMsg = sMsg & " (including non-match duplicates)"
MsgBox sMsg '//comment out if using option below

'Optional: Ask to run a process on the new list
' sMsg = sMsg & vbLf & vbLf _
' & "Do you want to process the new list?"
'
' vAns = MsgBox(sMsg, vbYesNo + vbQuestion)
' If vAns = vbYes Then
' 'Code... ('Call' a process to act on the new list)
' End If 'vAns = vbYes
Else
MsgBox "An error occured!"
End If 'bSuccess
End Sub


The new function:
Function FilterMatches(Matches As Long, Criteria() As Variant) As
Boolean
' Compares 2 user-specified cols and filters matches found.
' User can also specific target col to receive resulting list.
' Optionally supports returning a unique list or allow duplicates.
' Optionally supports returning matches or non-matches.
'
' Args In: Matches: ByRef var to return number of matches found to
the caller.
'
' vCriteria(): A variant array containing the filtering
parameters.
' Criteria(0) - Address of the values to be filtered
' Criteria(1) - Address of the values to check
' Criteria(2) - Label of the column to put the filtered
list
' Criteria(3) - Numeric value to determine if we return
matches or non-matches
' Criteria(4) - Numeric value to determine if we return a
unique list or allow dupes
'
' Returns: True if matches found and no error occurs;
' False if a: matches not found --OR-- error occurs;

Dim i&, j& 'as long
Dim vFilterRng, vCheckRng, vResult, vaMatches(), vaNoMatches(),
vaDataOut() 'as variant
Dim bReturnMatches As Boolean, bMatch As Boolean, bDupesAllowed As
Boolean
Dim cItemsToCheck As New Collection, sMsg$, sRngOut$ 'as string

'Load the filtering criteria
vFilterRng = Range(Criteria(0)): vCheckRng = Range(Criteria(1)):
sRngOut = Criteria(2)
bReturnMatches = (Criteria(3) = 1): bDupesAllowed = (Criteria(4) = 1)
ReDim vaMatches(UBound(vFilterRng)): ReDim
vaNoMatches(UBound(vFilterRng)): j = 0

'Load the Collection with the values to be checked.
'Collections only allow unique keys so use OERN (no need to check if
they already exist)
Set cItemsToCheck = New Collection: On Error Resume Next
For i = LBound(vCheckRng) To UBound(vCheckRng)
cItemsToCheck.Add Key:=CStr(vCheckRng(i, 1)), Item:=vbNullString
Next 'i
Err.Clear

'Check the Collection for matches
On Error GoTo MatchFound
For i = LBound(vFilterRng) To UBound(vFilterRng)
bMatch = False '..reset
cItemsToCheck.Add Key:=CStr(vFilterRng(i, 1)), Item:=vbNullString
If bMatch Then
If bReturnMatches Then vaMatches(j) = vFilterRng(i, 1): j = j + 1
Else
vaNoMatches(j) = vFilterRng(i, 1): j = j + 1
cItemsToCheck.Remove CStr(vFilterRng(i, 1)) '..so dupes of it
don't get counted
End If 'bMatch
Next 'i

'Initialize the return list
If bReturnMatches Then vResult = vaMatches Else vResult = vaNoMatches

'Return a list of unique values?
If Not bDupesAllowed Then
On Error GoTo UniqueList
Dim cUniqueList As New Collection
For i = LBound(vResult) To UBound(vResult)
cUniqueList.Add Key:=CStr(vResult(i)), Item:=vbNullString
Next 'i
End If 'Not bDupesAllowed
Err.Clear: On Error GoTo ErrExit

'Make the list to return contiguous.
ReDim vaDataOut(UBound(vResult), 0): j = 0
For i = LBound(vResult) To UBound(vResult)
If Not vResult(i) = "" Then vaDataOut(j, 0) = vResult(i): j = j + 1
Next 'i

If Matches > 0 Then '..only write if Matches > 0
Columns(sRngOut).ClearContents
With Range(sRngOut & "1").Resize(UBound(vaDataOut) + 1, 1)
.Value = vaDataOut
.NumberFormat = "0000000000000" '..optional
.EntireColumn.AutoFit '..optional
End With
End If 'Matches > 0


ErrExit:
' If bReturnMatches Then Matches = UBound(vResult) ' + 1
FilterMatches = (Err = 0): Exit Function

MatchFound:
bMatch = True: Matches = Matches + 1: Resume Next

UniqueList:
vResult(i) = "": Matches = Matches + 1: Resume Next

End Function 'FilterMatches()
 
I forgot to post the performance details...

There are 4 possible results using this utility:

1. Returns a list of matches with duplicates
2. Returns a list of non-matches with duplicates
3. Returns a unique list of matches (no duplicates)
4. Returns a unique list of non-matches

You can choose where to put the returned list. Tested on two cols x
500,000 rows of data, depending on which return options are selected
the new list generated in about 10 to 12 seconds. This might improve if
Calculation/EnableEvents/ScreenUpdating are toggled off/on, but I doubt
by much since the return list gets 'dumped' into the worksheet in one
shot. This produces a slight flicker that's reasonably acceptible IMO.
 
Back
Top