Search code examples
c++searchboostboost-filesystem

Search partial filenames in C++ using boost filesystem


The question is simple: I want to find a file's path inside a directory, but I only have part of the filename. Here are the functions I wrote for this task:

// Recursively collects every regular file under `root` whose extension
// equals `ext` (e.g. ".txt") and appends its path to `ret`.
// Does nothing when `root` does not exist or is not a directory.
void getfiles(const fs::path& root, const string& ext, vector<fs::path>& ret)
{
    const bool usableRoot = fs::exists(root) && fs::is_directory(root);
    if (!usableRoot)
        return;

    for (fs::recursive_directory_iterator entry(root), last; entry != last; ++entry)
    {
        if (!fs::is_regular_file(*entry))
            continue;
        if (entry->path().extension() == ext)
            ret.push_back(entry->path());
    }
}

// Searches `dir_path` recursively for the first entry whose filename contains
// `file_name` (case-insensitive). On success stores the entry's path in
// `path_found` and returns true; otherwise returns false and leaves
// `path_found` untouched. Every comparison result is echoed to cerr.
bool find_file(const filesystem::path& dir_path, const filesystem::path file_name, filesystem::path& path_found) {
    const fs::recursive_directory_iterator last;
    const auto nameContainsPattern = [file_name](fs::path candidate) {
        const bool matched = boost::algorithm::icontains(candidate.filename().native(), file_name.native());
        cerr << matched << endl;  // debug trace of the comparison result
        return matched;
    };

    const auto hit = find_if(fs::recursive_directory_iterator(dir_path), last, nameContainsPattern);
    if (hit == last)
        return false;

    path_found = hit->path();
    return true;
}


int main (int argc, char* argv[]) 
{
    vector<fs::path> inputClass ;
    fs::path textFiles,datasetPath,imgpath;
    textFiles=argv[1];
    datasetPath=argv[2];

    getfiles(textFiles,".txt",inputClass);
    
    for (int i=0;i<inputClass.size();i++)
        
    {
        ifstream lblFile(inputClass[i].string().c_str());
        string line;
        fs::path classname=inputClass[i].parent_path()/inputClass[i].stem().string();
        cerr<<classname.stem()<<endl;
        while (getline(lblFile,line))
        {
            
            bool find=find_file(datasetPath,line,imgpath);
            if (find)
            {
                
                while(!fs::exists(classname))
                    fs::create_directories (classname);
                fs::copy(imgpath,classname/imgpath.filename());
                cerr<<"Found\n";
            }
            else
                cerr<<"Not Found \n";
            
            
        }
        lblFile.close();
    }
    
    
}

Console out:

"490"
vfv343434.jpeg||E9408000EC0
0
fsdfdsfdfsf.jpeg||E9408000EC0
0
1200E9408000EC0.jpeg||E9408000EC0
0
Not Found 

But when I set the search string manually, it works fine! I tried other ways of searching the string, such as std::find, but every method fails to find the substring. There seems to be a problem with the input string (line): I printed all of its characters, but saw no special characters or anything unusual. If I set the search string manually, it works as desired:

// Experiment: a hard-coded pattern replaces the one read from the file
// (presumably placed inside the find_file predicate, where `e` is the candidate path).
string search="E9408000EC0";
        cerr<<e.filename().native()<<"||"<<search<<endl;
        cerr<<boost::algorithm::icontains(e.filename().native() ,search)<<endl;

 

The results for the above change look like this:

"490"
vfv343434.jpeg||E9408000EC0
0
fsdfdsfdfsf.jpeg||E9408000EC0
0
1200E9408000EC0.jpeg||E9408000EC0
1
Found

Solution

  • I cannot reproduce this.

    The only hunch I have is that on your platform, perhaps the string() accessor is not returning the plain string, but e.g. the quoted path. That would break the search. Consider using the native() accessor instead.

    (In fact, since file_name is NOT a path but a string pattern, I suggest passing the argument as std::string_view or similar instead.)

    Live On Coliru

    #include <boost/filesystem.hpp>
    #include <boost/algorithm/string.hpp>
    #include <iostream>
    namespace fs = boost::filesystem;
    
    // Walks `dir_path` recursively and writes to `out` every entry whose
    // filename contains `file_name`, compared case-insensitively.
    template <typename Out>
    void find_file(const fs::path& dir_path, const fs::path file_name, Out out) {
        const auto& pattern = file_name.native();
        for (fs::recursive_directory_iterator entry(dir_path), last; entry != last; ++entry) {
            const fs::path candidate = *entry;
            if (boost::algorithm::icontains(candidate.filename().native(), pattern)) {
                *out++ = *entry;
            }
        }
    }
    
    int main() {
        fs::path d = "a/b/c/e";
        fs::create_directories(d);
        {
            std::ofstream ofs(d / "1200E9408000EC0.jpeg");
        }
    
        std::cout << fs::path("000EC0").native() << "\n";
    
        std::vector<fs::path> found;
        find_file(".", "000EC0", back_inserter(found));
    
        for (auto &f : found)
        {
            std::cout << "Found: " << f << "\n";
        }
    }
    

    Prints

    000EC0
    Found: "./a/b/c/e/1200E9408000EC0.jpeg"
    

    UPDATE: Code Review

    For the updated question, I came up with a somewhat improved tester that works with boost::filesystem and with std::filesystem just the same.

    There are many small improvements (removing repetition, explicit conversions, using optional to return optional matches, etc.).

    Also added a whitespace trim to avoid choking on extraneous whitespace on the input lines:

    #include <algorithm>
    #include <fstream>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    #include <boost/algorithm/string.hpp>
    
    using boost::algorithm::icontains;
    using boost::algorithm::trim;
    
    #if defined(USE_BOOST_FS)
        #include <boost/filesystem.hpp>
        namespace fs = boost::filesystem;
        using boost::system::error_code;
    #else
        #include <filesystem>
        namespace fs = std::filesystem;
        using std::error_code;
    #endif
    
    // Appends to `ret` the path of every regular file below `root` (searched
    // recursively) whose extension equals `ext`. A `root` that is missing or
    // is not a directory leaves `ret` unchanged.
    void getfiles(
        const fs::path& root, const std::string& ext, std::vector<fs::path>& ret)
    {
        if (!(exists(root) && is_directory(root)))
            return;

        fs::recursive_directory_iterator cursor(root);
        const fs::recursive_directory_iterator last;
        while (cursor != last) {
            const auto& entry = *cursor;
            if (is_regular_file(entry) && entry.path().extension() == ext)
                ret.push_back(entry.path());
            ++cursor;
        }
    }
    
    // Returns the path of the first entry below `dir_path` (searched
    // recursively) whose filename contains `partial`, ignoring case, or
    // std::nullopt when no entry matches. Each examined entry and its match
    // result are traced to std::cerr.
    std::optional<fs::path> find_file(const fs::path& dir_path, fs::path partial)
    {
        auto const& search = partial.native();

        for (fs::recursive_directory_iterator cursor(dir_path), last;
             cursor != last; ++cursor) {
            fs::path const candidate = *cursor;
            bool const matches = icontains(candidate.filename().native(), search);
            std::cerr << candidate << " Matches: " << std::boolalpha << matches
                      << std::endl;
            if (matches)
                return candidate;
        }

        return std::nullopt;
    }
    
    // Gathers, via getfiles, every ".txt" file located under `textFiles` and
    // returns the collected paths.
    auto readInputClass(fs::path const& textFiles)
    {
        std::vector<fs::path> labelFiles;
        getfiles(textFiles, ".txt", labelFiles);
        return labelFiles;
    }
    
    // Usage: <program> <textFilesDir> <datasetDir>
    //
    // For every .txt file found under <textFilesDir>, creates a directory named
    // after the file (extension removed), then treats each non-blank line of
    // the file as a partial filename: the first dataset entry whose name
    // contains it (case-insensitively) is copied into that directory.
    // args.at() throws std::out_of_range when an argument is missing.
    int main(int argc, char** argv)
    {
        std::vector<std::string> const args(argv, argv + argc);
        auto const textFiles = readInputClass(args.at(1));
        std::string const datasetPath = args.at(2);

        for (fs::path classname : textFiles) {
            // open the text file
            std::ifstream lblFile(classname);

            // use base without extension as output directory
            classname.replace_extension();
            if (!fs::exists(classname)) {
                if (fs::create_directories(classname))
                    std::cerr << classname << " created" << std::endl;
            }

            for (std::string line; getline(lblFile, line);) {
                trim(line);

                // An empty pattern is contained in every filename, so a blank
                // line would otherwise copy whichever entry is visited first.
                if (line.empty())
                    continue;

                if (auto found = find_file(datasetPath, line)) {
                    auto dest = classname / found->filename();

                    // error_code overload: report failures (e.g. the
                    // destination already exists) instead of throwing
                    error_code ec;
                    copy(*found, dest, ec);
                    std::cerr << dest << " (" << ec.message() << ")\n";
                } else {
                    std::cerr << "Not Found \n";
                }
            }
        }
    }
    

    Testing from scratch with

    mkdir -pv textfiles dataset
    touch dataset/{vfv343434,fsdfdsfdfsf,1200E9408000EC0}.jpeg
    echo 'E9408000EC0 ' > textfiles/490.txt
    

    Running

    ./a.out textfiles/ dataset/
    

    Prints

    "textfiles/490" created
    "dataset/1200E9408000EC0.jpeg" Matches: true
    "textfiles/490/1200E9408000EC0.jpeg" (Success)
    

    Or on subsequent run

    "dataset/fsdfdsfdfsf.jpeg" Matches: false
    "dataset/1200E9408000EC0.jpeg" Matches: true
    "textfiles/490/1200E9408000EC0.jpeg" (File exists)
    

    BONUS

    This version adds some more diagnostics and avoids repeatedly traversing the filesystem for each pattern. The main program is now:

    Live On Coliru

    int main(int argc, char** argv)
    {
        std::vector<std::string> const args(argv, argv + argc);
    
        Paths const classes = getfiles(args.at(1), ".txt");
        Mappings map = readClassMappings(classes);
    
        std::cout << "Procesing " << map.size() << " patterns from "
                  << classes.size() << " classes" << std::endl;
    
        processDatasetDir(args.at(2), map);
    }
    

    And the remaining functions are implemented as:

    // Be smart about case-insensitive patterns: a Pattern is a std::string
    // whose ordering ignores letter case, so patterns that differ only in case
    // compare as equivalent.
    struct Pattern : std::string {
        using std::string::string;
        using std::string::operator=;

    #ifdef __cpp_lib_three_way_comparison
        // Case-insensitive three-way comparison.
        std::weak_ordering operator<=>(Pattern const& other) const {
            if (boost::ilexicographical_compare(*this, other)) {
                return std::weak_ordering::less;
            } else if (boost::ilexicographical_compare(other, *this)) {
                // other orders before *this, so *this is the greater one;
                // reporting `less` here too would break the ordering
                return std::weak_ordering::greater;
            }
            return std::weak_ordering::equivalent;
        }
    #else
        // Case-insensitive strict weak ordering for pre-C++20 library support.
        bool operator<(Pattern const& other) const {
            return boost::ilexicographical_compare(*this, other);
        }
    #endif
    };
    
    using Paths    = std::vector<fs::path>;
    using Mapping  = std::pair<Pattern, fs::path>;
    using Patterns = std::set<Pattern>;
    using Mappings = std::set<Mapping>;
    
    // Reads every class file in `classes` and returns the set of
    // (pattern, destination directory) mappings. Each non-blank line of a
    // class file is one pattern, with surrounding whitespace trimmed; the
    // destination is the class file's path with its extension removed.
    // A mapping that is already present is reported on std::cerr and dropped.
    Mappings readClassMappings(Paths const& classes)
    {
        Mappings mappings;
        for (fs::path classname : classes) {
            std::ifstream lblFile(classname);
            classname.replace_extension();

            for (Pattern pattern; getline(lblFile, pattern);) {
                trim(pattern);

                // An empty pattern is contained in every filename and would
                // map the whole dataset to this class; skip blank lines.
                if (pattern.empty())
                    continue;

                if (auto [it, ok] = mappings.emplace(pattern, classname); !ok) {
                    std::cerr << "WARNING: " << std::quoted(pattern)
                              << " duplicates " << std::quoted(it->first)
                              << std::endl;
                }
            }
        }

        return mappings;
    }
    
    // Traverses `datasetPath` recursively once. Every regular file whose
    // filename contains a mapping's pattern (case-insensitively) is copied
    // into that mapping's destination directory, which is created on demand.
    // Each copy attempt is logged to std::cerr and a summary is written to
    // std::cout. Returns the number of successful copies.
    size_t processDatasetDir(const fs::path& datasetPath, Mappings const& patterns)
    {
        size_t copied = 0, failed = 0;
        // Mappings that matched at least one file. Tracking mappings rather
        // than bare patterns keeps the "missing" figure consistent with
        // patterns.size() when the same pattern is listed by several classes.
        std::set<Mapping const*> matched;

        using It = fs::recursive_directory_iterator;
        for (It it = It(datasetPath), end; it != end; ++it) {
            if (!it->is_regular_file())
                continue;

            fs::path const& entry = *it;

            for (auto const& mapping : patterns) {
                auto const& [pattern, location] = mapping;
                if (icontains(it->path().filename().native(), pattern)) {
                    matched.insert(&mapping);

                    if (!exists(location) && fs::create_directories(location))
                        std::cerr << location << " created" << std::endl;

                    auto dest = location / entry.filename();

                    // error_code overload: a failed copy is counted and
                    // logged instead of aborting the traversal
                    error_code ec;
                    copy(entry, dest, ec);
                    std::cerr << dest << " (" << ec.message() << ") from "
                              << std::quoted(pattern) << "\n";

                    (ec? failed : copied) += 1;
                }
            }
        }

        std::cout << "Copied:" << copied
                  << ", missing:" << patterns.size() - matched.size()
                  << ", failed: " << failed << std::endl;
        return copied;
    }
    

    With some more "random" test data:

    mkdir -pv textfiles dataset
    touch dataset/{vfv343434,fsdfdsfdfsf,1200E9408000EC0}.jpeg
    echo .jPeg > textfiles/all_of_them.txt
    echo $'E9408000EC0 \n e9408000ec0\nE9408\nbOgUs' > textfiles/490.txt
    

    Running as

    ./a.out textfiles/ dataset/
    

    Prints:

    WARNING: "e9408000ec0" duplicates "E9408000EC0"
    Procesing 4 patterns from 2 classes
    "textfiles/all_of_them" created
    "textfiles/all_of_them/1200E9408000EC0.jpeg" (Success) from ".jPeg"
    "textfiles/490" created
    "textfiles/490/1200E9408000EC0.jpeg" (Success) from "E9408"
    "textfiles/490/1200E9408000EC0.jpeg" (File exists) from "E9408000EC0"
    "textfiles/all_of_them/vfv343434.jpeg" (Success) from ".jPeg"
    "textfiles/all_of_them/fsdfdsfdfsf.jpeg" (Success) from ".jPeg"
    Copied:4, missing:1, failed: 1