diff options
author | Aaron Ball <nullspoon@oper.io> | 2019-03-09 18:40:33 -0700 |
---|---|---|
committer | Aaron Ball <nullspoon@oper.io> | 2019-03-09 20:26:27 -0700 |
commit | c7033290ac21023c74f59c6f02a918cf6b2b36a8 (patch) | |
tree | 61569757b69602923b7376b3e9274a3af773d871 | |
parent | 93a26f6394546f53b9531cead1713311de0dda78 (diff) | |
download | upwgen-c7033290ac21023c74f59c6f02a918cf6b2b36a8.tar.gz upwgen-c7033290ac21023c74f59c6f02a918cf6b2b36a8.tar.xz |
Added unicode tiers
Previously, only the -i,--i18n switch existed, which would append the
world's top ~10 languages to the unicode array. This was often
problematic because many fonts don't have glyphs for the lesser-used
writing systems. This also had the inadvertent effect of making many of
the generated i18n passwords unacceptable for many services. This is
because the more common unicode characters are accepted (e.g. latin
basic, extended, supplement, etc.), but the less common are considered
"disallowed symbols", making most of the generated passwords
unacceptable.
This introduces the -1, -2, -3, and -4 switches to alleviate this. The
-1 switch includes only the most used scripts in the world (Latin and
IPA phonetic), which covers some of the biggest languages in use, making
these passwords the most likely to be acceptable. The subsequent switches
include progressively less-used scripts in sequence.
This commit splits out all i18n functions into the new i18n_cat library
to clean up main. Also made intrcat hidden, implementing
i18n_cat_ascii_* functions to replace all external calls to it.
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | src/i18n_cat.c | 282 | ||||
-rw-r--r-- | src/i18n_cat.h | 47 | ||||
-rw-r--r-- | src/main.c | 134 |
4 files changed, 369 insertions, 98 deletions
@@ -4,7 +4,9 @@ out = upwgen PREFIX = /usr/bin all: - $(CC) $(CCOPTS) src/main.c -o $(out) + @if [ ! -d obj ]; then mkdir obj; fi + $(CC) $(CCOPTS) -c src/i18n_cat.c -o obj/i18n_cat.o + $(CC) $(CCOPTS) src/main.c obj/*.o -o $(out) install: mkdir -p $(DESTDIR)/$(PREFIX) diff --git a/src/i18n_cat.c b/src/i18n_cat.c new file mode 100644 index 0000000..18712ac --- /dev/null +++ b/src/i18n_cat.c @@ -0,0 +1,282 @@ +/** + * upwgen generates random internationalized passwords + * Copyright (C) 2019 Aaron Ball <nullspoon@oper.io> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#include "i18n_cat.h" + + +/** + * intrcat: + * Integer range concatenate. Appends the specified integer range to an int + * array. + * + * @arr Array to cat range of ints to + * @rstart Range start integer + * @rend Range end integer + * + * @return Number of integers appended to array + */ +unsigned int intrcat(unsigned int* arr, unsigned int rstart, unsigned int rend) { + int i = 0; + int total = rend - rstart; // Calculate our return count + + while(arr[i] != '\0') + i++; + + while(rstart <= rend) { + arr[i] = rstart; + //printf("% -4d % -7d %lc\n", i, rstart, rstart); + rstart++; + i++; + } + + arr[i] = '\0'; + return total; +} + + +/** + * i18n_cat_ascii_upper: + * Appends the ascii [english] upper case characters to the destination array. 
+ * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_ascii_upper(unsigned int* dest) { + return intrcat(dest, 65, 90); +} + + +/** + * i18n_cat_ascii_lower: + * Appends the ascii [english] upper case characters to the destination array. + * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_ascii_lower(unsigned int* dest) { + return intrcat(dest, 97, 122); +} + + +/** + * i18n_cat_ascii_numerals: + * Appends the ascii numerals to the dest string. + * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_ascii_numerals(unsigned int* dest) { + return intrcat(dest, 48, 57); +} + + +/** + * i18n_cat_ascii_symbols: + * Appends the ascii symbols to the dest string. + * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_ascii_symbols(unsigned int* dest) { + unsigned int count = 0; + count += intrcat(dest, 33, 47); // English symbols ! - / + count += intrcat(dest, 58, 64); // English symbols : - @ + count += intrcat(dest, 91, 96); // English symbols [ - ` + count += intrcat(dest, 123, 126); // English symbols { - ~ + return count; +} + + +/** + * i18n_cat_ascii: + * Appends the entire ascii printable characters (without the space at dec 32) + * range to the dest string. This includes English numerals, upper, lower, and + * symbols. + * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_ascii(unsigned int* dest) { + return intrcat(dest, 0x0021, 0x007E); +} + + +/** + * i18n_cat_one: + * Appends the first group of unicode characters. This group covers some of the + * most common languages in the world, which use the latin script. This also + * includes the IPA extension characters. 
+ * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_one(unsigned int* dest) { + unsigned int count = 0; + count += intrcat(dest, 0x00A1, 0x00FF); // Latin-1 Suppliment + count += intrcat(dest, 0x0100, 0x017F); // Latin extended A + count += intrcat(dest, 0x0180, 0x024F); // Latin extended B + count += intrcat(dest, 0x0250, 0x02AF); // IPA Extensions + return count; +} + + +/** + * i18n_cat_two: + * Appends the second most popular group of unicode characters. This group + * attempts to cover scripts used by the second most common languages in the + * world. In this case, this includes the Devanagari (Hindi, Sanskrit), Hebrew, + * Arabic, and Cyrillic blocks. + * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_two(unsigned int* dest) { + unsigned int count = 0; + + // Devanagari (Sanskrit, Hindi, Marathi, Sindhi, Nepali, etc) + // This does not include vowels, as those are character modifiers that do not + // work with other character sets. + count += intrcat(dest, 0x0904, 0x0939); + count += intrcat(dest, 0x0958, 0x0961); + count += intrcat(dest, 0x0964, 0x096F); + count += intrcat(dest, 0x0972, 0x097F); + + // Only includes Hebrew consonants, since vowel marks require modification of + // a previous character, which doesn't work when combined with other scripts. + count += intrcat(dest, 0x05D0, 0x05EA); // Hebrew + count += intrcat(dest, 0x05F0, 0x05F4); // Hebrew + + // Arabic (only consonants) + count += intrcat(dest, 0x061E, 0x06FF); + + // Cyrillic and Cyrillic suppliment + count += intrcat(dest, 0x0400, 0x04F0); + count += intrcat(dest, 0x0500, 0x052F); + return count; +} + + +/** + * i18n_cat_three: + * Appends the third most popular group of unicode characters. This group + * attempts to cover scripts used by the third most common languages in the + * world. 
In this case, this includes the Armenian, Bengali, Greek, and Coptic + * blocks. + * + * NOTE: Using this function will likely introduce characters for which your + * font does not have glyphs. + * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_three(unsigned int* dest) { + unsigned int count = 0; + + // Armenian + // Armenian has a gap at 0x0557, 0x0558, 0x058B, and 0x058C + count += intrcat(dest, 0x0531, 0x0556); + count += intrcat(dest, 0x0559, 0x058A); + + // Bengali + count += intrcat(dest, 0x0985, 0x098C); + count += intrcat(dest, 0x098F, 0x0990); + count += intrcat(dest, 0x0993, 0x09A8); + count += intrcat(dest, 0x09AA, 0x09B0); + count += intrcat(dest, 0x09B2, 0x09B2); + count += intrcat(dest, 0x09B6, 0x09B9); + count += intrcat(dest, 0x09B6, 0x09B9); + count += intrcat(dest, 0x09DC, 0x09DD); + count += intrcat(dest, 0x09DF, 0x09D3); + count += intrcat(dest, 0x09D6, 0x09DC); + + // Greek and Coptic + // These are complicated because they are missing assigned values for + // 0x0378, 0x0379, 0x0380, 0x0381, 0x0382, 0x0383, 0x038B, 0x038D, 0x03A2 + count += intrcat(dest, 0x0370, 0x0377); + count += intrcat(dest, 0x037A, 0x037F); + count += intrcat(dest, 0x0384, 0x038A); + count += intrcat(dest, 0x038C, 0x038C); + count += intrcat(dest, 0x038C, 0x038C); + count += intrcat(dest, 0x038E, 0x03A1); + count += intrcat(dest, 0x03A3, 0x03FF); + + return count; +} + + +/** + * i18n_cat_four: + * Appends the forth most popular group of unicode characters. This group + * attempts to cover scripts used by the forth most common languages in the + * world. In this case, this includes the Thaana, NKo, Samaritan, Mandaic, + * Syriac, Runic, Tifinagh, and Georgian blocks. + * + * NOTE: Using this function will likely introduce characters for which your + * font does not have glyphs. 
+ * + * @dest Destination int array to copy into + * + * @return Count of characters appended + */ +unsigned int i18n_cat_four(unsigned int* dest) { + unsigned int count = 0; + + count += intrcat(dest, 0x0780, 0x07A5); // Thaana + count += intrcat(dest, 0x07C0, 0x07EA); // NKo + count += intrcat(dest, 0x0800, 0x0815); // Samaritan + count += intrcat(dest, 0x0830, 0x083E); // Samaritan + count += intrcat(dest, 0x0840, 0x085B); // Mandaic + + count += intrcat(dest, 0x0710, 0x072F); // Syriac + count += intrcat(dest, 0x074D, 0x074F); // Syriac + + count += intrcat(dest, 0x16A0, 0x16F8); // Runic + count += intrcat(dest, 0x2D30, 0x2D67); // Tifinagh + + // Georgian + count += intrcat(dest, 0x10A0, 0x10C5); + count += intrcat(dest, 0x10C7, 0x10C7); + count += intrcat(dest, 0x10CD, 0x10CD); + count += intrcat(dest, 0x10D0, 0x10FF); + + return count; +} + + +/** + * print_intl_arr: + * Prints array containing unsigned ints representing internal characters. + * Outputs to STDOUT the unicode decimal, followed by the unicode character. + * + * @arr Unicode array to print + */ +void i18n_dump_arr(unsigned int* arr) { + int i = 0; // cursor + + while(arr[i] != '\0') { + printf("0x%04x %5d: [%lc]\n", arr[i], arr[i], arr[i]); + i++; + } +} diff --git a/src/i18n_cat.h b/src/i18n_cat.h new file mode 100644 index 0000000..02ac3f0 --- /dev/null +++ b/src/i18n_cat.h @@ -0,0 +1,47 @@ +/** + * upwgen generates random internationalized passwords + * Copyright (C) 2019 Aaron Ball <nullspoon@oper.io> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <time.h> +#include <locale.h> + +/* Deprecated */ +unsigned int i18n_cat_arr(unsigned int*); + +/** + * Functions to append to the specified char array the ascii basic latin + * characters. + */ +unsigned int i18n_cat_ascii(unsigned int*); +unsigned int i18n_cat_ascii_numerals(unsigned int*); +unsigned int i18n_cat_ascii_upper(unsigned int*); +unsigned int i18n_cat_ascii_lower(unsigned int*); +unsigned int i18n_cat_ascii_symbols(unsigned int*); + + +/** + * Functions to append to the specified char array the first, second, third, + * and forth most used character set groups around the world. + */ +unsigned int i18n_cat_one(unsigned int*); +unsigned int i18n_cat_two(unsigned int*); +unsigned int i18n_cat_three(unsigned int*); +unsigned int i18n_cat_four(unsigned int*); + +void i18n_dump_arr(unsigned int*); @@ -19,97 +19,15 @@ #include <time.h> #include <locale.h> - -// intrcat: -// Integer range concatenate. Appends the specified integer range to an int -// array. -// -// @arr Array to cat range of ints to -// @rstart Range start integer -// @rend Range end integer -// -// @return Number of integers appended to array -int intrcat(unsigned int* arr, unsigned int rstart, unsigned int rend) { - int i = 0; - int total = rend - rstart; // Calculate our return count - - while(arr[i] != '\0') - i++; - - while(rstart <= rend) { - arr[i] = rstart; - //printf("% -4d % -7d %lc\n", i, rstart, rstart); - rstart++; - i++; - } - - arr[i] = '\0'; - return total; -} - - -// populate_intl_arr: -// Populates an unsigned integer array with common unicode (utf-8) language -// alphabets and symbols. 
-// -// Some example unicode integer ranges: -// 33 - 126 Standard english ascii -// 256 - 383 Latin extended A block -// 256 - 383 Latin extended B block -// 913 - 969 Greek -// 1040 - 1103 Russian -// 1329 - 1414 Armenian -// 1488 - 1514 Hebrew -// 65166 - 65265 Arabic -// -// No/rare font support (boo!) -// 2325 - 2373 Devanagari (Hindi) -// 2437 - 2509 Bengali alphabet -// 2949 - 3020 Tamil -// 3585 - 3663 Thai -// 5792 - 5880 Runic -// 11392 - 11483 Coptic alphabet -// 66560 - 66639 Deseret -// -// @out Unsigned int array to be populated. -// -// @return Size of the array contents -int populate_intl_arr(unsigned int* out) { - int count = 0; - - // Populate the array - count += intrcat(out, 33, 126); // English - count += intrcat(out, 256, 383); // Latin A block - count += intrcat(out, 399, 691); // Latin B block - count += intrcat(out, 913, 969); // Greek - count += intrcat(out, 1040, 1103); // Russian - count += intrcat(out, 1329, 1414); // Armenian - count += intrcat(out, 1488, 1514); // Hebrew - count += intrcat(out, 65166, 65265); // Arabic - - return count; -} - - -// print_intl_arr: -// Prints array containing unsigned ints representing internal characters. -// Outputs to STDOUT the unicode decimal, followed by the unicode character. -// -// @arr Unicode array to print -void print_intl_arr(unsigned int* arr) { - int i = 0; // cursor - - while(arr[i] != '\0') { - printf("%5d: [%lc]\n", arr[i], arr[i]); - i++; - } -} +#include "i18n_cat.h" void usage() { printf( "Upwgen is a password generator with international support. 
If no length\n" - "is specified, defaults to 32 characters output length\n\n" + "is specified, defaults to 32 characters output length, selecting from\n" + "the standard English character set (lower case, upper case, numerals,\n" + "and symbols).\n\n" "Usage:\n upwgen [options] [length]\n\n" "Options:\n" " -c,--capitalize Include at least one capital letter in output\n" @@ -117,6 +35,10 @@ void usage() { " -n,--numerals Include at least one numeral in output\n" " -y,--symbols Include at least one symbol in output\n" " -i,--i18n Include at least one international letter in output\n" + " -1 Include chars from the most used scripts in the world\n" + " -2 Include chars from the second most used scripts in the world\n" + " -3 Include chars from the third most used scripts in the world\n" + " -4 Include chars from the forth most used scripts in the world\n" "\n" " -h,--help Print this help text\n" ); @@ -125,11 +47,11 @@ void usage() { int main(int argc, char* argv[]) { struct timespec ts; // Timespec for seeding rng - int count; // Number of chars to choose from + unsigned int count; // Number of chars to choose from int len; // Password length int i; // Arg index unsigned long seed; // Seed for the RNG (current seconds * nanoseconds) - unsigned int chars[1024]; // Uint array to hold international chars + unsigned int chars[4096]; // Uint array to hold international chars // Initialize count = 0; @@ -140,21 +62,39 @@ int main(int argc, char* argv[]) { while(i < argc) { if(strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--capitals") == 0) { - count += intrcat(chars, 65, 90); // English uppercase + count += i18n_cat_ascii_upper(chars); + } else if(strcmp(argv[i], "-l") == 0 || strcmp(argv[i], "--lower") == 0) { - count += intrcat(chars, 97, 122); // English lower case + count += i18n_cat_ascii_lower(chars); + } else if(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--numerals") == 0) { - count += intrcat(chars, 48, 57); // English numerals + count += 
i18n_cat_ascii_numerals(chars); + } else if(strcmp(argv[i], "-y") == 0 || strcmp(argv[i], "--symbols") == 0) { - count += intrcat(chars, 33, 47); // English symbols ! - / - count += intrcat(chars, 58, 64); // English symbols : - @ - count += intrcat(chars, 91, 96); // English symbols [ - ` - count += intrcat(chars, 123, 126); // English symbols { - ~ + count += i18n_cat_ascii_symbols(chars); + } else if(strcmp(argv[i], "-i") == 0 || strcmp(argv[i], "--i18n") == 0) { - count += populate_intl_arr(chars); + count += i18n_cat_one(chars); + count += i18n_cat_two(chars); + count += i18n_cat_three(chars); + count += i18n_cat_four(chars); + + } else if(strcmp(argv[i], "-1") == 0) { + count += i18n_cat_one(chars); + + } else if(strcmp(argv[i], "-2") == 0) { + count += i18n_cat_two(chars); + + } else if(strcmp(argv[i], "-3") == 0) { + count += i18n_cat_three(chars); + + } else if(strcmp(argv[i], "-4") == 0) { + count += i18n_cat_four(chars); + } else if(strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { usage(); return 0; + } else { // If we reach this block, the user specified a custom length (or // fatfingered something). Test for ability to convert from str to int @@ -172,7 +112,7 @@ int main(int argc, char* argv[]) { // If no charset was specified, use standard ascii 33 - 126 chars, which // includes english lower case, upper case, numbers, and some symbols. if(chars[0] == '\0') - count += intrcat(chars, 33, 126); + count += i18n_cat_ascii(chars); // Get the random data seed clock_gettime(CLOCK_REALTIME, &ts); |