add word analyzer

This commit is contained in:
Paul Pan 2023-11-04 00:22:45 +08:00
parent e64ee43824
commit bb41e30cba
6 changed files with 10104 additions and 4 deletions

View File

@ -49,6 +49,10 @@ FetchContent_Declare(
) )
FetchContent_MakeAvailable(magic_enum) FetchContent_MakeAvailable(magic_enum)
message(STATUS "[DEPS] Processing tbb")
find_package(TBB REQUIRED COMPONENTS tbb)
find_package(Threads REQUIRED)
message(STATUS "[DEPS] Processing generator") message(STATUS "[DEPS] Processing generator")
file(DOWNLOAD file(DOWNLOAD
"https://raw.githubusercontent.com/lewissbaker/generator/master/include/__generator.hpp" "https://raw.githubusercontent.com/lewissbaker/generator/master/include/__generator.hpp"
@ -62,10 +66,13 @@ set(COMMON_SOURCES
stat_keystroke.cpp stat_keystroke.cpp
stat_length.cpp stat_length.cpp
stat_struct.cpp stat_struct.cpp
stat_word.cpp
) )
set(COMMON_LIBRARIES set(COMMON_LIBRARIES
magic_enum::magic_enum magic_enum::magic_enum
spdlog::spdlog spdlog::spdlog
TBB::tbb
Threads::Threads
) )
add_executable(password-analyzer ${COMMON_SOURCES} analyzer.cpp) add_executable(password-analyzer ${COMMON_SOURCES} analyzer.cpp)

View File

@ -13,5 +13,8 @@ int main() {
stat_date(DataSource::YAHOO); stat_date(DataSource::YAHOO);
stat_date(DataSource::CSDN); stat_date(DataSource::CSDN);
stat_word(DataSource::YAHOO);
stat_word(DataSource::CSDN);
return 0; return 0;
} }

10000
data/words.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,10 @@
#ifndef DEFS_HPP #ifndef DEFS_HPP
#define DEFS_HPP #define DEFS_HPP
#include <chrono>
#include <memory>
#include <string> #include <string>
#include <utility>
#if __has_include(<generator>) #if __has_include(<generator>)
#include <generator> #include <generator>
#else #else
@ -16,13 +19,33 @@ using std::operator""sv;
enum class DataSource { CSDN, YAHOO }; enum class DataSource { CSDN, YAHOO };
/// Scope timer: logs "[name] start..." when constructed and
/// "[name] time usage: Nms" when destroyed, measured with std::chrono::steady_clock.
struct Timer {
    /// Time point captured at the end of construction (after the start message is logged).
    std::chrono::time_point<std::chrono::steady_clock> begin;
    /// Label printed inside the brackets of both log lines.
    std::string name;

    /// @param name label for the timed section; taken by value and moved into the member
    ///             so that callers passing a temporary pay for no extra copy.
    explicit Timer(std::string name) : name(std::move(name)) {
        // The parameter has been moved from; always go through this->name from here on.
        spdlog::info("[{}] start...", this->name);
        // Take the timestamp last so the cost of the log call above is not measured.
        this->begin = std::chrono::steady_clock::now();
    }

    /// Logs the elapsed time since construction, in whole milliseconds.
    ~Timer() {
        auto end = std::chrono::steady_clock::now();
        spdlog::info("[{}] time usage: {}ms", name,
                     std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count());
    }
};
// Two-step token pasting: the extra level of indirection makes the preprocessor
// expand arguments such as __COUNTER__ before `##` glues them together.
#define CONCAT_IMPL(x, y) x##y
#define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y)
// Times the remainder of the enclosing scope: declares a uniquely named Timer with
// automatic storage, so its destructor runs (and logs) when the scope is left.
// Brace initialization keeps the declaration from being parsed as a function.
#define timeit(name) Timer MACRO_CONCAT(_timer_, __COUNTER__){name}
struct Cord { struct Cord {
int x, y; int x, y;
bool operator==(const Cord &rhs) const { return x == rhs.x && y == rhs.y; } bool operator==(const Cord &rhs) const { return x == rhs.x && y == rhs.y; }
}; };
bool is_num(char c); bool is_num(char c);
bool is_alpha(char c); bool is_alpha(char c);
std::string tolower(const std::string &raw);
std::generator<std::string> passwords(const DataSource &source); std::generator<std::string> passwords(const DataSource &source);
@ -30,5 +53,6 @@ void stat_date(const DataSource &source);
void stat_keystroke(const DataSource &source); void stat_keystroke(const DataSource &source);
void stat_length(const DataSource &source); void stat_length(const DataSource &source);
void stat_struct(const DataSource &source); void stat_struct(const DataSource &source);
void stat_word(const DataSource &source);
#endif // DEFS_HPP #endif // DEFS_HPP

60
stat_word.cpp Normal file
View File

@ -0,0 +1,60 @@
#include <execution>
#include <fstream>
#include <map>
#include <numeric>
#include <vector>
#include "defs.hpp"
std::vector<std::string> word_list;
void build_word_list() {
spdlog::info("building word list...");
std::ifstream is("data/words.txt");
std::string line;
while (std::getline(is, line))
if (line.size() >= 5) word_list.push_back(line);
}
void stat_word(const DataSource &source) {
timeit(fmt::format("stat_word({})", magic_enum::enum_name(source)));
if (word_list.empty()) build_word_list();
std::vector<std::string> passwords_vec;
std::vector<std::vector<std::string>> words;
std::vector<std::string> result;
std::map<std::string, size_t> stat;
auto eval = [](auto const &str) {
std::string lower = tolower(str);
std::vector<std::string> ret;
for (auto const &word : word_list)
if (lower.find(word) != std::string::npos) ret.push_back(word);
return ret;
};
auto merge = [](auto const &l, auto const &r) {
std::vector<std::string> ret = l;
ret.insert(ret.end(), r.begin(), r.end());
return ret;
};
std::ranges::copy(passwords(source), std::back_inserter(passwords_vec));
words.resize(passwords_vec.size());
{
timeit("split words");
std::transform(std::execution::par, passwords_vec.begin(), passwords_vec.end(), words.begin(), eval);
}
{
timeit("merge results");
result = std::reduce(std::execution::par, words.begin(), words.end(), std::vector<std::string>{}, merge);
}
for (auto const &word : result) stat[word]++;
std::vector<std::pair<std::string, size_t>> vec(stat.begin(), stat.end());
std::sort(vec.begin(), vec.end(), [](auto const &lhs, auto const &rhs) { return lhs.second > rhs.second; });
for (auto const &[word, count] : vec | std::views::take(10)) spdlog::info("{}: {}", word, count);
}

View File

@ -1,3 +1,4 @@
#include <algorithm>
#include <cctype> #include <cctype>
#include <fstream> #include <fstream>
#include <ranges> #include <ranges>
@ -9,14 +10,19 @@
bool is_num(char c) { return c >= '0' && c <= '9'; } bool is_num(char c) { return c >= '0' && c <= '9'; }
bool is_alpha(char c) { return std::isalpha(c); } bool is_alpha(char c) { return std::isalpha(c); }
/// Returns a copy of `raw` in which every character has been passed through std::tolower.
/// @param raw input string; it is not modified.
/// @return a string of the same length as `raw`.
std::string tolower(const std::string &raw) {
    std::string lowered;
    lowered.reserve(raw.size());
    for (const char ch : raw) {
        // std::tolower must be given a value representable as unsigned char,
        // so convert before the call and narrow the int result back to char.
        const unsigned char byte = static_cast<unsigned char>(ch);
        lowered.push_back(static_cast<char>(std::tolower(byte)));
    }
    return lowered;
}
static std::vector<std::string> data_buf[2]; static std::vector<std::string> data_buf[2];
enum class BufStatus { INIT, FILLING, DONE }; enum class BufStatus { INIT, FILLING, DONE };
static BufStatus data_buf_ok[2] = {BufStatus::INIT, BufStatus::INIT}; static BufStatus data_buf_ok[2] = {BufStatus::INIT, BufStatus::INIT};
std::generator<std::string> passwords(const DataSource &source) { std::generator<std::string> passwords(const DataSource &source) {
if (data_buf_ok[magic_enum::enum_integer(source)] == BufStatus::DONE) { if (data_buf_ok[magic_enum::enum_integer(source)] == BufStatus::DONE) {
for (auto const &password : data_buf[magic_enum::enum_integer(source)]) for (auto const &password : data_buf[magic_enum::enum_integer(source)]) co_yield password;
co_yield password;
co_return; co_return;
} }