#!/bin/bash
# +skip_license_check

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script will scan all md (markdown) files for bad references.
# It will look for strings of the form [...](...) and make sure that
# the (...) points to either a valid file in the source tree or, in the
# case of it being an http url, it'll make sure we can actually fetch it.
# References of the form (#anchor) are checked against the headings of the
# file that contains them.
#
# Usage: verify-links.sh [ dir | file ... ]
# default arg is root of our source tree
#
# Exit status: 0 when every reference resolved, 1 when at least one bad
# reference was reported (one line per problem on stdout) or when a command
# such as find failed.

set -o errexit
set -o nounset
set -o pipefail

REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/..

# Keep every command-line argument as its own array element so that several
# dirs/files (and paths containing blanks) reach find as separate operands.
if [ "$*" != "" ]; then
  args=("$@")
else
  args=("${REPO_ROOT}")
fi

# All scratch files live in one private directory that is removed on every
# exit path, including an early exit caused by errexit.
tmp=$(mktemp -d)
trap 'rm -rf -- "${tmp:?}"' EXIT

# A failing find still aborts the script (errexit), but an empty result is
# not an error: there is simply nothing to verify.
found=$(find "${args[@]}" -name "*.md")

# Drop anything under vendor/glide paths. The filtering is done in the shell
# rather than with "grep -v" so that "no line left" does not trip
# errexit/pipefail. Paths are newline-delimited, so file names containing a
# newline are not supported.
mdFiles=()
while IFS= read -r path; do
  case "${path}" in
    "" | *vendor* | *glide*) continue ;;
  esac
  mdFiles+=("${path}")
done <<< "${found}"

# The ${arr[@]+...} form keeps "set -o nounset" from rejecting an empty
# array on older bash releases.
for file in ${mdFiles[@]+"${mdFiles[@]}"}; do
  # echo scanning $file
  dir=$(dirname "${file}")

  # Replace ) with )\n so that each possible href is on its own line.
  # Then only grab lines that have [..](..) in them - put results in a tmp
  # file. If the file doesn't have any lines with [..](..) then skip it.
  # NOTE(review): relies on sed turning "\n" in the replacement into a
  # newline (GNU sed does) - confirm if other sed flavours must be supported.
  sed 's/)/)\n/g' < "${file}" \
    | grep '\[.*\](.*)' > "${tmp}/candidates" || continue

  # This sed will extract the [..](..) portion of each line; the href (the
  # stuff in the parens) is cut out of it in the loop below.
  sed 's/.*\(\[[^\[\]*\]([^()]*)\)/\1/' \
    < "${tmp}/candidates" > "${tmp}/links" || continue

  # Extract all headings and convert them to what their anchor looks like:
  # - strip off the leading #'s and leading/trailing blanks
  # - lower case stuff
  # - convert spaces to -'s
  # - remove punctuation marks (only accept 0-9, a-z and -)
  # "sed -n .../p" selects the heading lines itself, so a file without any
  # heading produces an empty anchor list instead of a failing pipeline.
  sed -n '/^ *#/s/ *#* *\(.*\) *$/\1/p' < "${file}" \
    | tr '[:upper:]' '[:lower:]' \
    | sed -e 's/ *$//' -e 's/ /-/g' -e 's/[^-a-zA-Z0-9]//g' \
      > "${tmp}/anchors"

  # NOTE(review): "read" is deliberately left without -r so that backslash
  # handling of link targets stays as it was (e.g. "\_" is read as "_");
  # confirm whether that is the desired treatment.
  # shellcheck disable=SC2162
  while read line; do
    # Strip off everything up to the opening paren and the trailing paren.
    ref=${line#*(}
    ref=${ref%)*}

    # An external href (ie. starts with http).
    if [ "${ref:0:4}" == "http" ]; then
      # -O /dev/null discards the downloaded document (-o would only
      # redirect the log and leave the download in the current directory).
      if ! wget --timeout 10 -q -O /dev/null "${ref}" > /dev/null 2>&1; then
        printf "%s: Can't load: url %s\n" "${file}" "${ref}" \
          | tee -a "${tmp}/errors"
      fi
      continue
    fi

    # A reference to a section of the same page: it must match one of the
    # anchors computed above, compared as a literal whole line.
    if [ "${ref:0:1}" == "#" ]; then
      ref=${ref:1}
      if ! grep -qxF -- "${ref}" "${tmp}/anchors"; then
        printf "%s: Can't find anchor '#%s'\n" "${file}" "${ref}" \
          | tee -a "${tmp}/errors"
      fi
      continue
    fi

    # Local file href. Remove everything after # (aka section of page);
    # anchors inside other files are not verified.
    ref=${ref%#*}
    newPath=${dir}/${ref}

    # And finally make sure the file is there.
    # debug line: echo ref: $ref "->" $newPath
    if [ ! -e "${newPath}" ]; then
      printf "%s: Can't find: %s\n" "${file}" "${newPath}" \
        | tee -a "${tmp}/errors"
    fi
  done < "${tmp}/links"
done

# Any reported problem was also appended to the errors file; its existence
# decides the exit status. The EXIT trap removes the scratch directory.
rc=0
if [ -e "${tmp}/errors" ]; then
  rc=1
fi
exit "${rc}"