Gewusst wie: Abfragen von Dateiduplikaten in einer Verzeichnisstruktur (LINQ)

Aktualisiert: November 2007

Es ist möglich, dass sich Dateien mit demselben Namen in mehr als einem Ordner befinden. Im Visual Studio-Installationsordner weisen zum Beispiel mehrere Ordner eine readme.htm-Datei auf. Die erste Abfrage in diesem Beispiel zeigt, wie solche mehrfach vorkommenden Dateinamen unterhalb eines angegebenen Stammordners ermittelt werden. Die zweite Abfrage zeigt, wie Dateien ermittelt werden, bei denen zusätzlich zum Namen auch Größe und Erstellungszeit übereinstimmen.

Beispiel

Module QueryDuplicateFileNames

    ' Entry point: runs the composite-key duplicate query against a
    ' hard-coded Visual Studio folder.
    Public Sub Main()

        Dim rootFolder As String = "C:\Program Files\Microsoft Visual Studio 9.0\Common7"

        ' To run the name-only query instead, replace the call below
        ' with QueryDuplicates1(rootFolder).
        QueryDuplicates2(rootFolder)

    End Sub
    ' Finds file names that occur at least twice below the given root
    ' folder and pages the matching groups to the console.
    Sub QueryDuplicates1(ByVal root As String)

        ' Sort by name first so that the groups come out in name order.
        Dim sortedFiles As IEnumerable(Of System.IO.FileInfo) = _
            GetFiles(root).OrderBy(Function(f) f.Name)

        ' One sequence of FileInfo objects per distinct file name.
        Dim groupsByName As IEnumerable(Of IEnumerable(Of System.IO.FileInfo)) = _
            sortedFiles.GroupBy(Function(f) f.Name, _
                                Function(sharedName, members) members)

        ' Only names that occur two or more times are duplicates.
        Dim duplicates As IEnumerable(Of IEnumerable(Of System.IO.FileInfo)) = _
            groupsByName.Where(Function(members) members.Count() >= 2)

        ' The root path length is passed so the pager can trim it from each line.
        PageOutput(duplicates, root.Length)

    End Sub
    ' Finds files whose name, creation time and length all match, i.e.
    ' files that appear to have been copied into several subfolders,
    ' and pages the matching groups to the console.
    Sub QueryDuplicates2(ByVal root As String)

        ' Sort by name first so that the groups come out in name order.
        Dim sortedFiles As IEnumerable(Of System.IO.FileInfo) = _
            GetFiles(root).OrderBy(Function(f) f.Name)

        ' Composite key: all three properties must be equal for two files
        ' to land in the same group.
        Dim groupsByKey As IEnumerable(Of IEnumerable(Of System.IO.FileInfo)) = _
            sortedFiles.GroupBy( _
                Function(f) New With {Key f.Name, Key f.CreationTime, Key f.Length}, _
                Function(sharedKey, members) members)

        ' Only keys that occur two or more times are duplicates.
        Dim duplicates As IEnumerable(Of IEnumerable(Of System.IO.FileInfo)) = _
            groupsByKey.Where(Function(members) members.Count() >= 2)

        ' The root path length is passed so the pager can trim it from each line.
        PageOutput(duplicates, root.Length)

    End Sub
    ' Pages console display for large query results. No more than one group per page.
    ' This sub specifically works with group queries of FileInfo objects
    ' but can be modified for any type.
    '   groupQuery  - a sequence of groups, each an IEnumerable(Of FileInfo).
    '   charsToSkip - number of leading characters removed from each full
    '                 path before it is printed.
    ' Pressing the 'End' key at a prompt stops paging immediately.
    Sub PageOutput(ByVal groupQuery, ByVal charsToSkip)

        ' Keep three rows of the window free (prompt, input cursor and one
        ' spare row). Clamp to at least one row: with a window of three rows
        ' or fewer the page size would otherwise be zero or negative and the
        ' paging loop below would never advance.
        Dim numLines As Integer = Math.Max(1, Console.WindowHeight - 3)

        For Each fg As IEnumerable(Of System.IO.FileInfo) In groupQuery
            ' Count the group once instead of on every page.
            Dim groupSize As Integer = fg.Count()

            ' Every group starts at the top of a fresh page.
            Dim currentLine As Integer = 0

            Do While currentLine < groupSize
                Console.Clear()

                ' Print the next page of this group with the path prefix trimmed.
                For Each line In fg.Skip(currentLine).Take(numLines)
                    Console.WriteLine(vbTab & line.FullName.Substring(charsToSkip))
                Next

                ' Advance the current position
                currentLine += numLines

                ' Give the user a chance to break out of the loop
                Console.WriteLine("Press any key for next page or the 'End' key to exit.")
                If Console.ReadKey().Key = ConsoleKey.End Then
                    Return
                End If
            Loop
        Next
    End Sub

    ' Returns a FileInfo object for every file below the given root folder,
    ' subfolders included. The path list is read when the function is called;
    ' the FileInfo objects are created as the result is enumerated.
    Function GetFiles(ByVal root As String) As System.Collections.Generic.IEnumerable(Of System.IO.FileInfo)
        Dim allPaths As System.Collections.ObjectModel.ReadOnlyCollection(Of String) = _
            My.Computer.FileSystem.GetFiles( _
                root, FileIO.SearchOption.SearchAllSubDirectories, "*.*")

        Return allPaths.Select(Function(filePath) New System.IO.FileInfo(filePath))
    End Function
End Module
class QueryDuplicateFileNames
{
    // Entry point: runs the name-only duplicate query, then waits for a key press.
    static void Main(string[] args)
    {
        // Replace this call with QueryDuplicates2() to group by
        // name, creation time and length instead of name alone.
        QueryDuplicates();

        // Hold the console window open until the user presses a key.
        Console.WriteLine("Press any key to exit.");
        Console.ReadKey();
    }

    // Lists every file name that occurs more than once below the start
    // folder, together with the (trimmed) paths at which it occurs.
    static void QueryDuplicates()
    {
        // Change the root drive or folder if necessary
        string startFolder = @"c:\program files\Microsoft Visual Studio 9.0\";

        // Snapshot of all files below the start folder.
        IEnumerable<System.IO.FileInfo> snapshot = GetFiles(startFolder);

        // The start folder prefix is cut from each path to keep lines short.
        int prefixLength = startFolder.Length;

        // Key: the bare file name. Elements: the trimmed full paths.
        // Only names with more than one path are kept.
        IEnumerable<System.Linq.IGrouping<string, string>> duplicateNames = snapshot
            .GroupBy(info => info.Name, info => info.FullName.Substring(prefixLength))
            .Where(sameName => sameName.Count() > 1);

        // Hand the query to the pager, which prints one group per page.
        PageOutput<string, string>(duplicateNames);
    }

    // A group key that can be passed to a separate method.
    // Two keys are equal when Name, CreationTime and Length all match.
    // Equals and GetHashCode are overridden together so that the key works
    // with GroupBy; ToString provides a friendly name for Key.ToString().
    class PortableKey
    {
        public string Name { get; set; }
        public DateTime CreationTime { get; set; }
        public long Length { get; set; }

        // Returns false for null and for objects of any other type,
        // instead of throwing from an unconditional cast.
        public override bool Equals(object obj)
        {
            PortableKey other = obj as PortableKey;
            if (other == null)
            {
                return false;
            }

            return other.CreationTime == this.CreationTime &&
                   other.Length == this.Length &&
                   other.Name == this.Name;
        }

        // Combines the hash codes of the three properties compared by Equals.
        // No string is formatted, so the result does not depend on the
        // current culture and nothing is allocated per call.
        public override int GetHashCode()
        {
            unchecked
            {
                int hash = 17;
                hash = hash * 31 + this.CreationTime.GetHashCode();
                hash = hash * 31 + this.Length.GetHashCode();
                hash = hash * 31 + (this.Name == null ? 0 : this.Name.GetHashCode());
                return hash;
            }
        }

        public override string ToString()
        {
            return String.Format("{0} {1} {2}", this.Name, this.Length, this.CreationTime);
        }
    }
    // Lists groups of files whose name, creation time and length all match,
    // i.e. files that appear to have been copied into several folders.
    static void QueryDuplicates2()
    {
        // Change the root drive or folder if necessary.
        string startFolder = @"c:\program files\Microsoft Visual Studio 9.0\Common7";

        // Make the lines shorter for the console display
        int charsToSkip = startFolder.Length;

        // Take a snapshot of the file system.
        IEnumerable<System.IO.FileInfo> fileList = GetFiles(startFolder);

        // Note the use of a compound key. Files that match
        // all three properties belong to the same group.
        // A named type is used to enable the query to be
        // passed to another method. Anonymous types can also be used
        // for composite keys but cannot be passed across method boundaries.
        var queryDupFiles =
            from file in fileList
            group file.FullName.Substring(charsToSkip) by
                new PortableKey { Name = file.Name, CreationTime = file.CreationTime, Length = file.Length } into fileGroup
            where fileGroup.Count() > 1
            select fileGroup;

        // The query is evaluated only here, by the pager. The unused
        // ToList()/Count() calls that used to precede this line each ran the
        // whole grouping an extra time and have been removed.
        PageOutput<PortableKey, string>(queryDupFiles);
    }


    // A generic method to page the output of the QueryDuplicates methods.
    // Here the type of the group must be specified explicitly. "var" cannot
    // be used in method signatures. This method does not display more than one
    // group per page. Pressing the 'End' key at a prompt stops paging.
    private static void PageOutput<K,V>(IEnumerable<System.Linq.IGrouping<K, V>> groupByExtList)
    {
        // "3" = 1 line for the group header + 1 for "Press any key" + 1 for input cursor.
        // Clamp to at least one line: in a window of three rows or fewer the
        // page size would otherwise be zero or negative and the paging loop
        // would keep showing empty pages without ever advancing.
        int numLines = Math.Max(1, Console.WindowHeight - 3);

        // Iterate through the outer collection of groups.
        foreach (var filegroup in groupByExtList)
        {
            // A null key is displayed like an empty one instead of throwing.
            string keyText = filegroup.Key == null ? String.Empty : filegroup.Key.ToString();
            string header = String.IsNullOrEmpty(keyText) ? "[none]" : keyText;

            // Count the group once instead of on every page.
            int itemCount = filegroup.Count();

            // Start each group at the top of a page.
            int currentLine = 0;

            // Output only as many lines of the current group as will fit in the window.
            do
            {
                Console.Clear();
                Console.WriteLine("Filename = {0}", header);

                // Print 'numLines' items starting at item number 'currentLine'.
                foreach (var fileName in filegroup.Skip(currentLine).Take(numLines))
                {
                    Console.WriteLine("\t{0}", fileName);
                }

                // Increment the line counter.
                currentLine += numLines;

                // Give the user a chance to escape.
                Console.WriteLine("Press any key to continue or the 'End' key to break...");
                if (Console.ReadKey().Key == ConsoleKey.End)
                {
                    return;
                }
            } while (currentLine < itemCount);
        }
    }


    // Returns a FileInfo object for every file below the specified path,
    // subfolders included.
    // This method assumes that the application has discovery
    // permissions for all folders under the specified path.
    // Throws DirectoryNotFoundException (naming the path) when the
    // directory does not exist.
    static IEnumerable<System.IO.FileInfo> GetFiles(string path)
    {
        if (!System.IO.Directory.Exists(path))
        {
            throw new System.IO.DirectoryNotFoundException(
                String.Format("Directory not found: '{0}'", path));
        }

        string[] fileNames = System.IO.Directory.GetFiles(path, "*.*", System.IO.SearchOption.AllDirectories);

        // The final size is known, so allocate the list once.
        List<System.IO.FileInfo> files = new List<System.IO.FileInfo>(fileNames.Length);
        foreach (string name in fileNames)
        {
            files.Add(new System.IO.FileInfo(name));
        }
        return files;
    }
}

Die erste Abfrage verwendet einen einfachen Schlüssel zur Ermittlung von Übereinstimmungen. Auf diese Weise werden Dateien gefunden, die den gleichen Namen haben, deren Inhalt aber unterschiedlich sein könnte. Bei der zweiten Abfrage wird ein zusammengesetzter Schlüssel verwendet, der Übereinstimmungen in drei Eigenschaften des FileInfo-Objekts bestimmt. Mit dieser Abfrage ist es wahrscheinlicher, Dateien zu finden, die den gleichen Namen sowie einen ähnlichen oder identischen Inhalt haben.

Kompilieren des Codes

  • Erstellen Sie ein Visual Studio-Projekt, das die .NET Framework-Version 3.5 als Ziel hat. Standardmäßig weist das Projekt einen Verweis auf System.Core.dll und eine using-Direktive (C#) oder einen importierten Namespace (Visual Basic) für den System.Linq-Namespace auf. Fügen Sie in C#-Projekten eine using-Direktive für den System.IO-Namespace hinzu.

  • Kopieren Sie diesen Code in Ihr Projekt.

  • Drücken Sie F5, um das Programm zu kompilieren und auszuführen.

  • Drücken Sie eine beliebige Taste, um das Konsolenfenster zu schließen.

Robuste Programmierung

Für umfassende Abfragevorgänge über die Inhalte mehrerer Arten von Dokumenten und Dateien empfiehlt es sich, die Windows-Desktopsuche zu verwenden.

Siehe auch

Konzepte

LINQ to Objects

LINQ und Dateiverzeichnisse