:
##########################################################################
# Shellscript:	striphtml - remove HTML tags from input
# Version    :	$Revision: 1.1.1.1 $
# Author     :	Heiner Steven (stv)
# Category   :	HTML, File Conversion
# Date       :	09.03.1999
# RCS-Id.    :	$Id: striphtml.txt,v 1.1.1.1 1999/06/15 19:29:16 heiner Exp $
##########################################################################
# Description
#	Filter that copies the named files (or standard input when no
#	file is given) to standard output with HTML markup removed:
#	"<br>" tags become line breaks, all other tags and all
#	character entities are deleted.
#
#	Usage: striphtml [-h] [file ...]
#
#	Limitation: the input is processed line by line, so a tag that
#	is split across several input lines is not removed.
##########################################################################

PN=`basename "$0"`			# Program name
# Extract the bare version number from the RCS keyword string.
VER=`echo '$Revision: 1.1.1.1 $' | cut -d' ' -f2`

# Usage - print a short usage message to stderr and terminate with status 1.
Usage () {
    echo >&2 "$PN - strip HTML tags from input, $VER (stv '99)
usage: $PN [file ...]"
    exit 1
}

# Msg - write each argument as a separate line to stderr, prefixed with
# the program name.
Msg () {
    for MsgLine
    do printf '%s: %s\n' "$PN" "$MsgLine" >&2
    done
}

# Fatal - print the arguments using Msg, then terminate with status 1.
Fatal () { Msg "$@"; exit 1; }

# StripTags - the filter itself.
# Arguments: optional list of input files; reads stdin if there are none.
# Output:    the transformed text on stdout.
#
# Transform the input the following way:
#   1. Replace "<br>" tags (also "<br/>" and "<br />") with a newline
#      (special handling)
#   2. Remove all tags delimited by "<..>"
#   3. Remove character entities (i.e. "&copy;", "&nbsp;" or "&#169;")
#
# The entity pattern only accepts letters, digits and a leading "#"
# between "&" and ";", so ordinary text such as "a & b; c" is left alone.
# "--" keeps file names starting with "-" from being taken as sed options.
StripTags () {
    sed -e 's:<[bB][rR][ /]*>:\
:g' \
	-e 's:<[^>]*>::g' \
	-e 's:&[#A-Za-z0-9][A-Za-z0-9]*;::g' \
	-- "$@"
}

# Main - parse the command line, then filter the remaining file operands.
# Uses the "getopts" builtin instead of the external "getopt" command:
# "getopt" output has to be re-split by the shell, which breaks file names
# containing whitespace or wildcard characters.
Main () {
    OPTIND=1				# allow Main to be called repeatedly
    while getopts h Opt
    do
	case "$Opt" in
	    # your flags here
	    h)	Usage;;
	    *)	Usage;;			# "getopts" detected an error
	esac
    done
    shift `expr $OPTIND - 1`		# drop the options, keep file names

    StripTags "$@"
}

Main "$@"