#!/bin/sh
# Copyright (C) 2019-2020 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This script intends to facilitate spell checking of source/doc files.
# It:
# - transforms the files into a list of lowercase words
# - prefixes each word with the frequency
# - filters out words within a frequency range
# - sorts the words, longest first
#
# If '-c' is passed as option, it operates on the C comments only, rather than
# on the entire file.
#
# For:
# ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
# $ ./gdb/contrib/words.sh -c $files
# ...
# it generates a list of ~15000 words prefixed with frequency.
#
# This could be used to generate a dictionary that is kept as part of the
# sources, against which new code can be checked, generating a warning or
# error. The hope is that misspellings would trigger this frequently, and rare
# words rarely, otherwise the burden of updating the dictionary would be too
# much.
#
# And for:
# ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
# $ ./gdb/contrib/words.sh -c -f 1 $files
# ...
# it generates a list of ~5000 words with frequency 1.
#
# This can be used to scan for misspellings manually.
#
# Print a diagnostic on stderr and exit with status 2 (usage error).
usage_error () {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 2
}

# Parse the leading command-line options.
#
# Options:
#   -c              scan only the text inside C comments
#   -f N, --freq N  keep only words that occur exactly N times
#   --min N         keep only words that occur at least N times
#   --max N         keep only words that occur at most N times
#
# N must be a non-negative integer; a bound of 0 means "no limit".
# Parsing stops at the first argument that is not one of these options.
#
# Sets the globals c (true/false), minfreq, maxfreq, and nparsed (the number
# of arguments consumed, so the caller can shift them away).
parse_options () {
  minfreq=
  maxfreq=
  c=false
  nparsed=$#

  while [ $# -gt 0 ]; do
    case "$1" in
      -c)
        c=true
        shift
        ;;
      --freq | -f | --min | --max)
        # Without this check 'shift 2' fails when the value is missing;
        # bash then leaves the arguments untouched and the loop never ends.
        if [ $# -lt 2 ]; then
          usage_error "option '$1' requires an argument"
        fi
        # The bounds end up in an awk comparison, so accept digits only.
        case "$2" in
          '' | *[!0-9]*)
            usage_error "invalid frequency '$2' for option '$1'"
            ;;
        esac
        case "$1" in
          --min) minfreq=$2 ;;
          --max) maxfreq=$2 ;;
          *)
            minfreq=$2
            maxfreq=$2
            ;;
        esac
        shift 2
        ;;
      *)
        break
        ;;
    esac
  done

  # Any bound that was not given explicitly is unlimited.
  minfreq=${minfreq:-0}
  maxfreq=${maxfreq:-0}
  nparsed=$((nparsed - $#))
}

parse_options "$@"
# Drop the options; what remains are the files to scan.
shift "$nparsed"
# Write the awk program used by '-c' into a temporary file that is removed
# when the script exits.
awkfile=$(mktemp) || {
  printf '%s: cannot create temporary file\n' "${0##*/}" >&2
  exit 1
}
trap 'rm -f "$awkfile"' EXIT

# The here-document delimiter is quoted so that the shell leaves the awk
# source (notably $0) untouched.
cat > "$awkfile" <<'EOF'
# Print the text found inside C block comments.
#
# For every input line that contains any part of a comment, one output line
# is printed holding the comment text of that line; the text of several
# comments on the same line is joined with a single space.  The "/*" and
# "*/" markers themselves are not printed.  String literals and "//"
# comments are not recognized.

FNR == 1 {
    # An unterminated comment must not leak into the next input file.
    in_comment = 0
}

{
    rest = $0
    text = ""
    nseg = 0

    # Walk through the line, alternating between looking for the start of
    # a comment and looking for its end.
    while (1) {
        if (in_comment) {
            pos = index(rest, "*/")
            if (pos == 0) {
                # The comment continues on the next line.
                seg = rest
                rest = ""
            } else {
                seg = substr(rest, 1, pos - 1)
                rest = substr(rest, pos + 2)
                in_comment = 0
            }
            if (nseg > 0)
                text = text " " seg
            else
                text = seg
            nseg++
            if (in_comment)
                break
        } else {
            pos = index(rest, "/*")
            if (pos == 0)
                break
            rest = substr(rest, pos + 2)
            in_comment = 1
        }
    }

    if (nseg > 0)
        print text
}
EOF
# Stabilize sort.
export LC_ALL=C

# Write the text to scan on stdout: the C comments of the given files when
# $c is true (using the awk program in $awkfile), otherwise the files as is.
extract_text () {
  if $c; then
    awk -f "$awkfile" -- "$@"
  else
    cat "$@"
  fi
}

# Read text on stdin and write "<count> <word>" lines on stdout, longest
# line first.
#
# The text is split at punctuation, whitespace and digits, lowercased, and
# each distinct word is counted.  Only words whose count lies between
# $minfreq and $maxfreq are kept; a bound of 0 (or an unset bound) means
# "no limit".
rank_words () {
  sed \
    -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
    -e 's/\[/\n/g' \
    -e 's/\]/\n/g' \
    -e "s/'/\n/g" \
    -e 's/[0-9][0-9]*/\n/g' \
    -e 's/[ \t]*//g' \
  | tr '[:upper:]' '[:lower:]' \
  | sed -e '/^$/d' \
  | sort \
  | uniq -c \
  | awk -v min="${minfreq:-0}" -v max="${maxfreq:-0}" '
      # Keep the lines within the frequency range and prefix each with its
      # length, which is the key for the numeric sort that follows.
      (min == 0 || min <= $1) && (max == 0 || $1 <= max) {
        print length($0) " " $0
      }' \
  | sort -n -r \
  | cut -d ' ' -f 2-
}

extract_text "$@" | rank_words