#!/bin/bash

# Known OpenType items for *assembling* a glyph which create warnings:
#  Most of these are from texgyre.
# aogonekacute - a with ogonek and acute accent
# cybreve - the cyrillic breve is a different shape from the latin breve.
# eogonekacute - e with ogonek and acute accent
# idotaccent - apparently used for creating i with dot and accent in turkish.
# iogonekacute - i with ogonek and acute accent
# i_dot in FreeSerif - assume idotaccent although the font lacks accented SC
# jacute - j with acute accent
# oogonekacute - o with ogonek and acute accent
# orogate - an old (14th century CE) Polish o with vertical lines above and below it
#  see https://www.unicode.org/L2/L2021/21039-old-polish-o.pdf
# ubrevebelowinverted - u with inverted breve below it
# ustraitcy - cyrillic straight u, for adding diacriticals (Vollkorn)
# ustraitstrokecy - cyrillic straight u with stroke for adding diacriticals (Vollkorn)
# yi_yicy - a ligature for ukrainian yi yi (Vollkorn)
#
# Identifications are often based on random posts google found at forum.glyphsapp.com/
# which is for a macOS font editor.
#
# A mediaevalist font such as Junicode has a lot more like this.

# Byte-wise collation for all the grep/sed/sort pipelines below.
# Must be *exported* or the child processes never see it (plain
# LC_ALL=C only set a shell-local variable).
export LC_ALL=C
VARIANT=0 # normal, items are *.sc

# Look at a font which contains small caps, find the codepoints they cover.
# Use command -v instead of testing a hard-coded /usr/bin path, so an
# otfinfo installed anywhere on $PATH is found.
if ! command -v otfinfo >/dev/null 2>&1; then
	echo "you need to install lcdf-typetools"
	exit 2
fi

# point to list of glyphs from
# https://github.com/adobe-type-tools/agl-aglfn/blob/master/glyphlist.txt
GLYPHLIST=/sources/scripts/font-analysis/glyphlist.txt

if [ ! -r "$GLYPHLIST" ]; then
	echo "cannot read $GLYPHLIST"
	exit 2
fi
if [ "$#" -ne 1 ]; then
	echo "pass the /full/path/to/filename.{otf,ttf} as a single argument"
	exit 2
fi

# Insecure temp files, assumes only running one of these at a time
# so clear out any temp files from previous run
>/tmp/possible-sc-items
>/tmp/all-sc-items
>/tmp/named-sc-items
>/tmp/uni-sc-items
>/tmp/uninum-sc-items
>/tmp/alpha-sc-items
>/tmp/alnum-sc-items
echo "looking for all possible small caps in $1"
# "$1" is quoted throughout so font paths containing spaces survive.
# The pipeline's exit status is grep's, so testing it directly tells us
# whether any '.sc' glyph names were found.
if ! otfinfo -g "$1" | grep '\.sc' >/tmp/possible-sc-items; then
	echo "did not find any '.sc' in $1, looking for '^sc.'"
	# some fonts prefix rather than suffix: sc.a, sc.b, ...
	if otfinfo -g "$1" | grep '^sc\.' >/tmp/possible-sc-items; then
		VARIANT=1
	else
		echo "No Small Caps found in $1"
		exit
	fi
fi

if [ "$VARIANT" = "0" ]; then
	# Drop everything after first decimal point
	# because of things like aacute.SngStory.sc
	echo "reducing to only the '.*.sc' small caps"
	# keep everything up to sc but lose anything after.
	# No grep needed: possible-sc-items was produced by grep '\.sc'
	# above, so every line already matches, and sed can read the
	# file directly instead of cat piping it in.
	sed 's/\(.*\.sc\).*/\1/' /tmp/possible-sc-items >/tmp/all-sc-items
else
	# simulate the common case
	# fix up odd variants ssharp  : germandbls
	# i_dot : assume idotaccent
	echo "reducing to only the '^sc.' small caps"
	# single sed process; expressions are applied in order per line,
	# exactly as the old two-stage pipeline did
	sed -e 's/^sc\.//' \
	    -e 's/ssharp/germandbls/' \
	    -e 's/i_dot/idotaccent/' /tmp/possible-sc-items >/tmp/all-sc-items
fi

# now split into uni items and named items
echo "splitting into names and uniNNNN"
# IFS= read -r: preserve leading whitespace and backslashes in glyph
# names instead of letting read mangle them.
while IFS= read -r line
do
	# texgyre fonts have prefixed variations of sc combining tilde,
	# h_uni0303.sc l_uni0303.sc t_uni0303.sc : reduce to uni0303
	# Vollkorn has items like _part.cheabkhasiancy.sc
	#
	# A builtin case statement replaces two grep subprocesses per
	# line; the arms are tried in the same order the greps ran.
	case $line in
	*uni*)
		# uni-sc-items can include uni0434.loclBGR.sc,
		# uni006A0301.sc (both from Vollkorn).
		# Keep only the first 4 hex digits after the (last) 'uni'.
		echo "$line" | sed -e 's/^.*uni/uni/' -e 's/\(^uni....\).*/\1/' >>/tmp/uni-sc-items
		;;
	_part*)
		# Strip out '_part.someglyph.sc'
		# I don't know the correct terminology
		echo "Ignoring $line, is an internal item, not a codepoint"
		;;
	*)
		# Now strip off everything after the first '.'
		echo "$line" | sed 's/\..*//' >>/tmp/named-sc-items
		;;
	esac
done </tmp/all-sc-items

# it is possible that either file might have non-unique items
# so use sort -u for both, even though uni-sc-items are in order
# first, convert uni items to numbers
echo "converting uniNNNN to U+NNNN format"
# sed and sort read their input files directly; no cat needed
sed 's/uni/U+/' /tmp/uni-sc-items | sort -u >/tmp/uninum-sc-items

# the unicode data is ordered, A..Z,a..z
# so aim to read the table only once - in fact, simple
# repeated greps seem fast enough
echo "sorting named items into order"
sort -u /tmp/named-sc-items >/tmp/alpha-sc-items

echo "Processing named items into unicode values"
# On a huge file, looping through the glyphs and matching might be worth
# the effort, but the number of small caps is not usually very large.
>/tmp/alnum-sc-items
while IFS= read -r line
do
	# glyphlist lines are 'name;CODE', so field 1 is the glyph name
	ITEM=$(echo "$line" | cut -d ';' -f 1)
	# Ligatures are reported as f_i f_l etc
	# but the glyphlist has ff ffi ffl fl etc
	# it looks as if I'm losing another - Fira is reported to have 'brevecy' ?
	# I might be missing a few more, but this is probably adequate.
	if ! grep -q "^$ITEM;" "$GLYPHLIST"; then
		# ligatures may be f_f etc, glyphlist has ff:
		# retry with the underscores removed
		SHORT=$(echo "$ITEM" | sed 's/_//g')
		if grep -q "^$SHORT;" "$GLYPHLIST"; then
			# use the short name
			ITEM=$SHORT
		else
			echo "Warning, assume $ITEM is a work item, not a codepoint"
		fi
	fi
	# emit the codepoint(s) prefixed with U+; if ITEM is still not in
	# the glyphlist this grep matches nothing and appends nothing
	grep "^$ITEM;" "$GLYPHLIST" | cut -d ';' -f2 | sed 's/^/U+/' >>/tmp/alnum-sc-items
done < /tmp/alpha-sc-items

# Finish by merging using sort -u and writing to $1-sc.codepoints in $CWD
echo "final sort"
# fonts such as FreeSerif do not specify a separate weight, so remove '.*';
# strip any '-Weight' suffix and the file extension from the font filename
NAME=$(basename "$1" | cut -d '-' -f1 | cut -d '.' -f1)
#echo NAME is $NAME

# awk keeps only the first codepoint on each line (some glyphlist
# entries map a name to a space-separated codepoint sequence), then the
# named and uniNNNN results are merged and de-duplicated
cat /tmp/alnum-sc-items /tmp/uninum-sc-items | awk '{ print $1 }' |
 sort -u >"$NAME-sc.codepoints"

exit
