add word analyzer
This commit is contained in:
parent
e64ee43824
commit
bb41e30cba
@ -49,6 +49,10 @@ FetchContent_Declare(
|
||||
)
|
||||
FetchContent_MakeAvailable(magic_enum)
|
||||
|
||||
message(STATUS "[DEPS] Processing tbb")
|
||||
find_package(TBB REQUIRED COMPONENTS tbb)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
message(STATUS "[DEPS] Processing generator")
|
||||
file(DOWNLOAD
|
||||
"https://raw.githubusercontent.com/lewissbaker/generator/master/include/__generator.hpp"
|
||||
@ -62,10 +66,13 @@ set(COMMON_SOURCES
|
||||
stat_keystroke.cpp
|
||||
stat_length.cpp
|
||||
stat_struct.cpp
|
||||
stat_word.cpp
|
||||
)
|
||||
set(COMMON_LIBRARIES
|
||||
magic_enum::magic_enum
|
||||
spdlog::spdlog
|
||||
TBB::tbb
|
||||
Threads::Threads
|
||||
)
|
||||
|
||||
add_executable(password-analyzer ${COMMON_SOURCES} analyzer.cpp)
|
||||
|
@ -13,5 +13,8 @@ int main() {
|
||||
stat_date(DataSource::YAHOO);
|
||||
stat_date(DataSource::CSDN);
|
||||
|
||||
stat_word(DataSource::YAHOO);
|
||||
stat_word(DataSource::CSDN);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
10000
data/words.txt
Normal file
10000
data/words.txt
Normal file
File diff suppressed because it is too large
Load Diff
24
defs.hpp
24
defs.hpp
@ -1,7 +1,10 @@
|
||||
#ifndef DEFS_HPP
|
||||
#define DEFS_HPP
|
||||
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#if __has_include(<generator>)
|
||||
#include <generator>
|
||||
#else
|
||||
@ -16,6 +19,25 @@ using std::operator""sv;
|
||||
|
||||
enum class DataSource { CSDN, YAHOO };
|
||||
|
||||
struct Timer {
|
||||
std::chrono::time_point<std::chrono::steady_clock> begin;
|
||||
std::string name;
|
||||
explicit Timer(std::string name) {
|
||||
spdlog::info("[{}] start...", name);
|
||||
this->name = name;
|
||||
this->begin = std::chrono::steady_clock::now();
|
||||
}
|
||||
~Timer() {
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
spdlog::info("[{}] time usage: {}ms", name,
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count());
|
||||
}
|
||||
};
|
||||
// Two-level token pasting: the extra indirection forces the preprocessor to
// expand __COUNTER__ to its numeric value before ## glues the tokens together.
#define CONCAT_IMPL(x, y) x##y
#define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y)
// timeit(name): declares a uniquely named Timer in the current scope, so the
// elapsed time is logged when that scope is left. The Timer lives on the stack;
// no heap allocation or reference counting is needed for a scope-bound object.
#define timeit(name) Timer MACRO_CONCAT(_timer_, __COUNTER__) { name }
|
||||
|
||||
struct Cord {
|
||||
int x, y;
|
||||
bool operator==(const Cord &rhs) const { return x == rhs.x && y == rhs.y; }
|
||||
@ -23,6 +45,7 @@ struct Cord {
|
||||
|
||||
bool is_num(char c);
|
||||
bool is_alpha(char c);
|
||||
std::string tolower(const std::string &raw);
|
||||
|
||||
std::generator<std::string> passwords(const DataSource &source);
|
||||
|
||||
@ -30,5 +53,6 @@ void stat_date(const DataSource &source);
|
||||
void stat_keystroke(const DataSource &source);
|
||||
void stat_length(const DataSource &source);
|
||||
void stat_struct(const DataSource &source);
|
||||
void stat_word(const DataSource &source);
|
||||
|
||||
#endif // DEFS_HPP
|
||||
|
60
stat_word.cpp
Normal file
60
stat_word.cpp
Normal file
@ -0,0 +1,60 @@
|
||||
#include <execution>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
#include "defs.hpp"
|
||||
|
||||
std::vector<std::string> word_list;
|
||||
|
||||
void build_word_list() {
|
||||
spdlog::info("building word list...");
|
||||
|
||||
std::ifstream is("data/words.txt");
|
||||
std::string line;
|
||||
while (std::getline(is, line))
|
||||
if (line.size() >= 5) word_list.push_back(line);
|
||||
}
|
||||
|
||||
void stat_word(const DataSource &source) {
|
||||
timeit(fmt::format("stat_word({})", magic_enum::enum_name(source)));
|
||||
|
||||
if (word_list.empty()) build_word_list();
|
||||
|
||||
std::vector<std::string> passwords_vec;
|
||||
std::vector<std::vector<std::string>> words;
|
||||
std::vector<std::string> result;
|
||||
std::map<std::string, size_t> stat;
|
||||
|
||||
auto eval = [](auto const &str) {
|
||||
std::string lower = tolower(str);
|
||||
std::vector<std::string> ret;
|
||||
for (auto const &word : word_list)
|
||||
if (lower.find(word) != std::string::npos) ret.push_back(word);
|
||||
return ret;
|
||||
};
|
||||
|
||||
auto merge = [](auto const &l, auto const &r) {
|
||||
std::vector<std::string> ret = l;
|
||||
ret.insert(ret.end(), r.begin(), r.end());
|
||||
return ret;
|
||||
};
|
||||
|
||||
std::ranges::copy(passwords(source), std::back_inserter(passwords_vec));
|
||||
words.resize(passwords_vec.size());
|
||||
|
||||
{
|
||||
timeit("split words");
|
||||
std::transform(std::execution::par, passwords_vec.begin(), passwords_vec.end(), words.begin(), eval);
|
||||
}
|
||||
{
|
||||
timeit("merge results");
|
||||
result = std::reduce(std::execution::par, words.begin(), words.end(), std::vector<std::string>{}, merge);
|
||||
}
|
||||
|
||||
for (auto const &word : result) stat[word]++;
|
||||
std::vector<std::pair<std::string, size_t>> vec(stat.begin(), stat.end());
|
||||
std::sort(vec.begin(), vec.end(), [](auto const &lhs, auto const &rhs) { return lhs.second > rhs.second; });
|
||||
|
||||
for (auto const &[word, count] : vec | std::views::take(10)) spdlog::info("{}: {}", word, count);
|
||||
}
|
10
utils.cpp
10
utils.cpp
@ -1,3 +1,4 @@
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <fstream>
|
||||
#include <ranges>
|
||||
@ -9,14 +10,19 @@
|
||||
// True when c is one of the ASCII digits '0' through '9'.
bool is_num(char c) {
    return '0' <= c && c <= '9';
}
|
||||
// True when std::isalpha classifies c as alphabetic in the current C locale.
bool is_alpha(char c) {
    // std::isalpha requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behaviour, so convert first.
    return std::isalpha(static_cast<unsigned char>(c)) != 0;
}
|
||||
|
||||
// Returns a copy of raw in which every byte has been passed through
// std::tolower (each byte is converted to unsigned char before the call).
std::string tolower(const std::string &raw) {
    std::string lowered;
    lowered.reserve(raw.size());
    for (const unsigned char ch : raw) {
        lowered.push_back(static_cast<char>(std::tolower(ch)));
    }
    return lowered;
}
|
||||
|
||||
// Per-source buffer of passwords, indexed by magic_enum::enum_integer(source).
// Once the matching status below is DONE, passwords() replays this buffer.
static std::vector<std::string> data_buf[2];
|
||||
// State of one data_buf slot; every slot starts as INIT.
// NOTE(review): the INIT -> FILLING -> DONE transitions are not visible here — confirm in passwords().
enum class BufStatus { INIT, FILLING, DONE };
|
||||
// Status of each data_buf slot, indexed the same way as data_buf.
static BufStatus data_buf_ok[2] = {BufStatus::INIT, BufStatus::INIT};
|
||||
|
||||
std::generator<std::string> passwords(const DataSource &source) {
|
||||
if (data_buf_ok[magic_enum::enum_integer(source)] == BufStatus::DONE) {
|
||||
for (auto const &password : data_buf[magic_enum::enum_integer(source)])
|
||||
co_yield password;
|
||||
for (auto const &password : data_buf[magic_enum::enum_integer(source)]) co_yield password;
|
||||
co_return;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user