add word analyzer
This commit is contained in:
parent
e64ee43824
commit
bb41e30cba
@ -49,6 +49,10 @@ FetchContent_Declare(
|
|||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(magic_enum)
|
FetchContent_MakeAvailable(magic_enum)
|
||||||
|
|
||||||
|
message(STATUS "[DEPS] Processing tbb")
|
||||||
|
find_package(TBB REQUIRED COMPONENTS tbb)
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
message(STATUS "[DEPS] Processing generator")
|
message(STATUS "[DEPS] Processing generator")
|
||||||
file(DOWNLOAD
|
file(DOWNLOAD
|
||||||
"https://raw.githubusercontent.com/lewissbaker/generator/master/include/__generator.hpp"
|
"https://raw.githubusercontent.com/lewissbaker/generator/master/include/__generator.hpp"
|
||||||
@ -62,10 +66,13 @@ set(COMMON_SOURCES
|
|||||||
stat_keystroke.cpp
|
stat_keystroke.cpp
|
||||||
stat_length.cpp
|
stat_length.cpp
|
||||||
stat_struct.cpp
|
stat_struct.cpp
|
||||||
|
stat_word.cpp
|
||||||
)
|
)
|
||||||
set(COMMON_LIBRARIES
|
set(COMMON_LIBRARIES
|
||||||
magic_enum::magic_enum
|
magic_enum::magic_enum
|
||||||
spdlog::spdlog
|
spdlog::spdlog
|
||||||
|
TBB::tbb
|
||||||
|
Threads::Threads
|
||||||
)
|
)
|
||||||
|
|
||||||
add_executable(password-analyzer ${COMMON_SOURCES} analyzer.cpp)
|
add_executable(password-analyzer ${COMMON_SOURCES} analyzer.cpp)
|
||||||
|
@ -13,5 +13,8 @@ int main() {
|
|||||||
stat_date(DataSource::YAHOO);
|
stat_date(DataSource::YAHOO);
|
||||||
stat_date(DataSource::CSDN);
|
stat_date(DataSource::CSDN);
|
||||||
|
|
||||||
|
stat_word(DataSource::YAHOO);
|
||||||
|
stat_word(DataSource::CSDN);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
10000
data/words.txt
Normal file
10000
data/words.txt
Normal file
File diff suppressed because it is too large
Load Diff
24
defs.hpp
24
defs.hpp
@ -1,7 +1,10 @@
|
|||||||
#ifndef DEFS_HPP
|
#ifndef DEFS_HPP
|
||||||
#define DEFS_HPP
|
#define DEFS_HPP
|
||||||
|
|
||||||
|
#include <chrono>
|
||||||
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
#if __has_include(<generator>)
|
#if __has_include(<generator>)
|
||||||
#include <generator>
|
#include <generator>
|
||||||
#else
|
#else
|
||||||
@ -16,6 +19,25 @@ using std::operator""sv;
|
|||||||
|
|
||||||
enum class DataSource { CSDN, YAHOO };
|
enum class DataSource { CSDN, YAHOO };
|
||||||
|
|
||||||
|
struct Timer {
|
||||||
|
std::chrono::time_point<std::chrono::steady_clock> begin;
|
||||||
|
std::string name;
|
||||||
|
explicit Timer(std::string name) {
|
||||||
|
spdlog::info("[{}] start...", name);
|
||||||
|
this->name = name;
|
||||||
|
this->begin = std::chrono::steady_clock::now();
|
||||||
|
}
|
||||||
|
~Timer() {
|
||||||
|
auto end = std::chrono::steady_clock::now();
|
||||||
|
spdlog::info("[{}] time usage: {}ms", name,
|
||||||
|
std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// Token-pasting helpers. The extra level of indirection lets the preprocessor
// expand the arguments (e.g. __COUNTER__) before ## glues them together.
#define CONCAT_IMPL(x, y) x##y
#define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y)
// Declares a Timer with a per-use unique variable name, so several timeit()
// calls can share one scope. The timer is an automatic object: it is destroyed
// (and reports its elapsed time) when the enclosing scope ends.
#define timeit(name) Timer MACRO_CONCAT(_timer_, __COUNTER__){name}
|
||||||
|
|
||||||
struct Cord {
|
struct Cord {
|
||||||
int x, y;
|
int x, y;
|
||||||
bool operator==(const Cord &rhs) const { return x == rhs.x && y == rhs.y; }
|
bool operator==(const Cord &rhs) const { return x == rhs.x && y == rhs.y; }
|
||||||
@ -23,6 +45,7 @@ struct Cord {
|
|||||||
|
|
||||||
bool is_num(char c);
|
bool is_num(char c);
|
||||||
bool is_alpha(char c);
|
bool is_alpha(char c);
|
||||||
|
std::string tolower(const std::string &raw);
|
||||||
|
|
||||||
std::generator<std::string> passwords(const DataSource &source);
|
std::generator<std::string> passwords(const DataSource &source);
|
||||||
|
|
||||||
@ -30,5 +53,6 @@ void stat_date(const DataSource &source);
|
|||||||
void stat_keystroke(const DataSource &source);
|
void stat_keystroke(const DataSource &source);
|
||||||
void stat_length(const DataSource &source);
|
void stat_length(const DataSource &source);
|
||||||
void stat_struct(const DataSource &source);
|
void stat_struct(const DataSource &source);
|
||||||
|
void stat_word(const DataSource &source);
|
||||||
|
|
||||||
#endif // DEFS_HPP
|
#endif // DEFS_HPP
|
||||||
|
60
stat_word.cpp
Normal file
60
stat_word.cpp
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
#include <execution>
|
||||||
|
#include <fstream>
|
||||||
|
#include <map>
|
||||||
|
#include <numeric>
|
||||||
|
#include <vector>
|
||||||
|
#include "defs.hpp"
|
||||||
|
|
||||||
|
std::vector<std::string> word_list;
|
||||||
|
|
||||||
|
void build_word_list() {
|
||||||
|
spdlog::info("building word list...");
|
||||||
|
|
||||||
|
std::ifstream is("data/words.txt");
|
||||||
|
std::string line;
|
||||||
|
while (std::getline(is, line))
|
||||||
|
if (line.size() >= 5) word_list.push_back(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
void stat_word(const DataSource &source) {
|
||||||
|
timeit(fmt::format("stat_word({})", magic_enum::enum_name(source)));
|
||||||
|
|
||||||
|
if (word_list.empty()) build_word_list();
|
||||||
|
|
||||||
|
std::vector<std::string> passwords_vec;
|
||||||
|
std::vector<std::vector<std::string>> words;
|
||||||
|
std::vector<std::string> result;
|
||||||
|
std::map<std::string, size_t> stat;
|
||||||
|
|
||||||
|
auto eval = [](auto const &str) {
|
||||||
|
std::string lower = tolower(str);
|
||||||
|
std::vector<std::string> ret;
|
||||||
|
for (auto const &word : word_list)
|
||||||
|
if (lower.find(word) != std::string::npos) ret.push_back(word);
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto merge = [](auto const &l, auto const &r) {
|
||||||
|
std::vector<std::string> ret = l;
|
||||||
|
ret.insert(ret.end(), r.begin(), r.end());
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::ranges::copy(passwords(source), std::back_inserter(passwords_vec));
|
||||||
|
words.resize(passwords_vec.size());
|
||||||
|
|
||||||
|
{
|
||||||
|
timeit("split words");
|
||||||
|
std::transform(std::execution::par, passwords_vec.begin(), passwords_vec.end(), words.begin(), eval);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
timeit("merge results");
|
||||||
|
result = std::reduce(std::execution::par, words.begin(), words.end(), std::vector<std::string>{}, merge);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto const &word : result) stat[word]++;
|
||||||
|
std::vector<std::pair<std::string, size_t>> vec(stat.begin(), stat.end());
|
||||||
|
std::sort(vec.begin(), vec.end(), [](auto const &lhs, auto const &rhs) { return lhs.second > rhs.second; });
|
||||||
|
|
||||||
|
for (auto const &[word, count] : vec | std::views::take(10)) spdlog::info("{}: {}", word, count);
|
||||||
|
}
|
10
utils.cpp
10
utils.cpp
@ -1,3 +1,4 @@
|
|||||||
|
#include <algorithm>
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <ranges>
|
#include <ranges>
|
||||||
@ -9,14 +10,19 @@
|
|||||||
bool is_num(char c) { return c >= '0' && c <= '9'; }
|
bool is_num(char c) { return c >= '0' && c <= '9'; }
|
||||||
// Reports whether c is alphabetic according to std::isalpha.
// The value is converted to unsigned char first: handing std::isalpha a
// negative char (any byte >= 0x80 where char is signed) is undefined behaviour.
bool is_alpha(char c) { return std::isalpha(static_cast<unsigned char>(c)) != 0; }
|
||||||
|
|
||||||
|
/// Returns a copy of `raw` in which every character has been passed through std::tolower.
std::string tolower(const std::string &raw) {
    auto ret = raw;
    // The lambda takes unsigned char because std::tolower requires a value in
    // that range; its int result is converted back to char explicitly instead
    // of relying on an implicit narrowing when the element is assigned.
    std::transform(ret.begin(), ret.end(), ret.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return ret;
}
|
||||||
|
|
||||||
static std::vector<std::string> data_buf[2];
|
static std::vector<std::string> data_buf[2];
|
||||||
enum class BufStatus { INIT, FILLING, DONE };
|
enum class BufStatus { INIT, FILLING, DONE };
|
||||||
static BufStatus data_buf_ok[2] = {BufStatus::INIT, BufStatus::INIT};
|
static BufStatus data_buf_ok[2] = {BufStatus::INIT, BufStatus::INIT};
|
||||||
|
|
||||||
std::generator<std::string> passwords(const DataSource &source) {
|
std::generator<std::string> passwords(const DataSource &source) {
|
||||||
if (data_buf_ok[magic_enum::enum_integer(source)] == BufStatus::DONE) {
|
if (data_buf_ok[magic_enum::enum_integer(source)] == BufStatus::DONE) {
|
||||||
for (auto const &password : data_buf[magic_enum::enum_integer(source)])
|
for (auto const &password : data_buf[magic_enum::enum_integer(source)]) co_yield password;
|
||||||
co_yield password;
|
|
||||||
co_return;
|
co_return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user