Add caching support to verify-links (#2150)
- Update link checking pipeline to enable caching - Add loading cache file from a http endpoint - Add retry logic when hitting 429 Fix verify link renaming scenarios - Correctly error if link format is incorrect - Handle case where the replace regex is not passed - Improve the error logging so it is easier to identify the broken links
This commit is contained in:
parent
fdfc915565
commit
85e358a2ba
@ -22,9 +22,10 @@ steps:
|
||||
arguments: >
|
||||
-urls ${{ parameters.Urls }}
|
||||
-rootUrl "file://${{ parameters.WorkingDirectory }}/${{ parameters.Directory }}"
|
||||
-recursive: ${{ parameters.Recursive }}
|
||||
-recursive: ${{ parameters.Recursive }}
|
||||
-ignoreLinksFile ${{ parameters.IgnoreLinksFile }}
|
||||
-branchReplaceRegex "${{ parameters.BranchReplaceRegex }}"
|
||||
-branchReplacementName ${{ parameters.BranchReplacementName }}
|
||||
-devOpsLogging: $true
|
||||
-checkLinkGuidance: ${{ parameters.CheckLinkGuidance }}
|
||||
-checkLinkGuidance: ${{ parameters.CheckLinkGuidance }}
|
||||
-inputCacheFile "https://azuresdkartifacts.blob.core.windows.net/verify-links-cache/verify-links-cache.txt"
|
||||
@ -12,47 +12,48 @@
|
||||
Specifies the file that contains a set of links to ignore when verifying.
|
||||
|
||||
.PARAMETER devOpsLogging
|
||||
Switch that will enable devops specific logging for warnings
|
||||
Switch that will enable devops specific logging for warnings
|
||||
|
||||
.PARAMETER recursive
|
||||
Check the links recurisvely based on recursivePattern.
|
||||
|
||||
Check the links recurisvely based on recursivePattern.
|
||||
|
||||
.PARAMETER baseUrl
|
||||
Recursively check links for all links verified that begin with this baseUrl, defaults to the folder the url is contained in.
|
||||
|
||||
|
||||
.PARAMETER rootUrl
|
||||
Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
|
||||
|
||||
Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
|
||||
|
||||
.PARAMETER errorStatusCodes
|
||||
List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
|
||||
|
||||
List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
|
||||
|
||||
.PARAMETER branchReplaceRegex
|
||||
Regex to check if the link needs to be replaced. E.g. ^(https://github.com/.*/(?:blob|tree)/)master(/.*)$
|
||||
|
||||
|
||||
.PARAMETER branchReplacementName
|
||||
The substitute branch name or SHA commit.
|
||||
|
||||
The substitute branch name or SHA commit.
|
||||
|
||||
.PARAMETER checkLinkGuidance
|
||||
Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
|
||||
|
||||
Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
|
||||
|
||||
.PARAMETER userAgent
|
||||
UserAgent to be configured for web requests. Defaults to current Chrome version.
|
||||
|
||||
.INPUTS
|
||||
None. No required inputs.
|
||||
.PARAMETER inputCacheFile
|
||||
Path to a file that contains a list of links that are known valid so we can skip checking them.
|
||||
|
||||
.OUTPUTS
|
||||
None. Verify-Links.ps1 does not generate any output.
|
||||
.PARAMETER outputCacheFile
|
||||
Path to a file that the script will output all the validated links after running all checks.
|
||||
|
||||
.EXAMPLE
|
||||
PS> .\Verify-Links.ps1
|
||||
PS> .\Verify-Links.ps1 C:\README.md
|
||||
|
||||
.EXAMPLE
|
||||
PS> .\Verify-Links.ps1 -urls C:\README.md
|
||||
PS> .\Verify-Links.ps1 https://azure.github.io/azure-sdk/index.html
|
||||
|
||||
.EXAMPLE
|
||||
PS> .\Verify-Links -urls C:\README.md -checkLinkGuidance $true
|
||||
PS> .\Verify-Links C:\README.md -checkLinkGuidance $true
|
||||
#>
|
||||
[CmdletBinding()]
|
||||
param (
|
||||
[string[]] $urls,
|
||||
[string] $ignoreLinksFile = "$PSScriptRoot/ignore-links.txt",
|
||||
@ -64,7 +65,9 @@ param (
|
||||
[string] $branchReplaceRegex = "",
|
||||
[string] $branchReplacementName = "",
|
||||
[bool] $checkLinkGuidance = $false,
|
||||
[string] $userAgent
|
||||
[string] $userAgent,
|
||||
[string] $inputCacheFile,
|
||||
[string] $outputCacheFile
|
||||
)
|
||||
|
||||
$ProgressPreference = "SilentlyContinue"; # Disable invoke-webrequest progress dialog
|
||||
@ -88,7 +91,7 @@ function NormalizeUrl([string]$url){
|
||||
}
|
||||
|
||||
if ($script:rootUrl -eq "") {
|
||||
if ($uri.IsFile) {
|
||||
if ($uri.IsFile) {
|
||||
# for files default to the containing directory
|
||||
$script:rootUrl = $script:baseUrl;
|
||||
}
|
||||
@ -129,7 +132,7 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
|
||||
# If the link is mailto, skip it.
|
||||
if ($link.StartsWith("mailto:")) {
|
||||
Write-Verbose "Skipping $link because it is a mailto link."
|
||||
return $null
|
||||
return
|
||||
}
|
||||
|
||||
$linkUri = [System.Uri]$link;
|
||||
@ -156,12 +159,12 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
|
||||
# If the link is not a web request, like mailto, skip it.
|
||||
if (!$linkUri.Scheme.StartsWith("http") -and !$linkUri.IsFile) {
|
||||
Write-Verbose "Skipping $linkUri because it is not http or file based."
|
||||
return $null
|
||||
return
|
||||
}
|
||||
|
||||
if ($null -ne $ignoreLinks -and ($ignoreLinks.Contains($link) -or $ignoreLinks.Contains($linkUri.ToString()))) {
|
||||
Write-Verbose "Ignoring invalid link $linkUri because it is in the ignore file."
|
||||
return $null
|
||||
return
|
||||
}
|
||||
|
||||
return $linkUri;
|
||||
@ -177,28 +180,34 @@ function ParseLinks([string]$baseUri, [string]$htmlContent)
|
||||
#$hrefs | Foreach-Object { Write-Host $_ }
|
||||
|
||||
Write-Verbose "Found $($hrefs.Count) raw href's in page $baseUri";
|
||||
$links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value } | Sort-Object -Unique
|
||||
$links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value }
|
||||
|
||||
#$links | Foreach-Object { Write-Host $_ }
|
||||
|
||||
return $links
|
||||
}
|
||||
|
||||
function CheckLink ([System.Uri]$linkUri)
|
||||
function CheckLink ([System.Uri]$linkUri, $allowRetry=$true)
|
||||
{
|
||||
if(!$linkUri.ToString().Trim()) {
|
||||
LogWarning "Found Empty link. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
|
||||
return $false
|
||||
}
|
||||
if ($checkedLinks.ContainsKey($linkUri)) {
|
||||
if (!$checkedLinks[$linkUri]) {
|
||||
LogWarning "broken link $linkUri"
|
||||
|
||||
$originalLinkUri = $linkUri
|
||||
$linkUri = ReplaceGithubLink $linkUri
|
||||
|
||||
$link = $linkUri.ToString()
|
||||
|
||||
if ($checkedLinks.ContainsKey($link)) {
|
||||
if (!$checkedLinks[$link]) {
|
||||
LogWarning "broken link $link"
|
||||
}
|
||||
return $checkedLinks[$linkUri]
|
||||
return $checkedLinks[$link]
|
||||
}
|
||||
|
||||
$linkValid = $true
|
||||
Write-Verbose "Checking link $linkUri..."
|
||||
Write-Verbose "Checking link $linkUri..."
|
||||
|
||||
if ($linkUri.IsFile) {
|
||||
if (!(Test-Path $linkUri.LocalPath)) {
|
||||
@ -234,27 +243,58 @@ function CheckLink ([System.Uri]$linkUri)
|
||||
}
|
||||
|
||||
if ($statusCode -in $errorStatusCodes) {
|
||||
LogWarning "[$statusCode] broken link $linkUri"
|
||||
if ($originalLinkUri -ne $linkUri) {
|
||||
LogWarning "[$statusCode] broken link $originalLinkUri (resolved to $linkUri)"
|
||||
}
|
||||
else {
|
||||
LogWarning "[$statusCode] broken link $linkUri"
|
||||
}
|
||||
|
||||
$linkValid = $false
|
||||
}
|
||||
else {
|
||||
if ($null -ne $statusCode) {
|
||||
Write-Host "[$statusCode] while requesting $linkUri"
|
||||
# For 429 rate-limiting try to pause if possible
|
||||
if ($allowRetry -and $_.Exception.Response -and $statusCode -eq 429) {
|
||||
$retryAfter = $_.Exception.Response.Headers.RetryAfter.Delta.TotalSeconds
|
||||
|
||||
# Default retry after 60 (arbitrary) seconds if no header given
|
||||
if (!$retryAfter -or $retryAfter -gt 60) { $retryAfter = 60 }
|
||||
Write-Host "Rate-Limited for $retryAfter seconds while requesting $linkUri"
|
||||
|
||||
Start-Sleep -Seconds $retryAfter
|
||||
$linkValid = CheckLink $originalLinkUri -allowRetry $false
|
||||
}
|
||||
else {
|
||||
Write-Host "[$statusCode] handled while requesting $linkUri"
|
||||
# Override and set status code in the cache so it is truthy
|
||||
# so we don't keep checking but we don't think it is valid either
|
||||
$linkValid = $statusCode
|
||||
}
|
||||
}
|
||||
else {
|
||||
Write-Host "Exception while requesting $linkUri"
|
||||
Write-Host $_.Exception.ToString()
|
||||
# Override and set exception in the cache so it is truthy
|
||||
# so we don't keep checking but we don't think it is valid either
|
||||
$linkValid = "Exception"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
elseif ($link.StartsWith("#")) {
|
||||
# Ignore anchor links as we don't have a great way to check them.
|
||||
}
|
||||
else {
|
||||
LogWarning "Link has invalid format $linkUri"
|
||||
$linkValid = $false
|
||||
}
|
||||
|
||||
if ($checkLinkGuidance) {
|
||||
if ($linkUri.Scheme -eq 'http') {
|
||||
LogWarning "DO NOT use 'http' in $linkUri. Please use secure link with https instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
|
||||
$linkValid = $false
|
||||
}
|
||||
$link = $linkUri.ToString()
|
||||
# Check if the url is relative links, suppress the archor link validation.
|
||||
if (!$linkUri.IsAbsoluteUri -and !$link.StartsWith("#")) {
|
||||
LogWarning "DO NOT use relative link $linkUri. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
|
||||
@ -272,16 +312,16 @@ function CheckLink ([System.Uri]$linkUri)
|
||||
}
|
||||
}
|
||||
|
||||
$checkedLinks[$linkUri] = $linkValid
|
||||
$checkedLinks[$link] = $linkValid
|
||||
return $linkValid
|
||||
}
|
||||
|
||||
function ReplaceGithubLink([string]$originLink) {
|
||||
if (!$branchReplacementName) {
|
||||
if (!$branchReplacementName -or !$branchReplaceRegex) {
|
||||
return $originLink
|
||||
}
|
||||
$ReplacementPattern = "`${1}$branchReplacementName`$2"
|
||||
return $originLink -replace $branchReplaceRegex, $ReplacementPattern
|
||||
return $originLink -replace $branchReplaceRegex, $ReplacementPattern
|
||||
}
|
||||
|
||||
function GetLinks([System.Uri]$pageUri)
|
||||
@ -327,7 +367,7 @@ if ($urls) {
|
||||
if ($urls.Count -eq 0) {
|
||||
Write-Host "Usage $($MyInvocation.MyCommand.Name) <urls>";
|
||||
exit 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($PSVersionTable.PSVersion.Major -lt 6)
|
||||
@ -335,17 +375,47 @@ if ($PSVersionTable.PSVersion.Major -lt 6)
|
||||
LogWarning "Some web requests will not work in versions of PS earlier then 6. You are running version $($PSVersionTable.PSVersion)."
|
||||
}
|
||||
$ignoreLinks = @();
|
||||
if (Test-Path $ignoreLinksFile)
|
||||
{
|
||||
$ignoreLinks = [Array](Get-Content $ignoreLinksFile | ForEach-Object { ($_ -replace "#.*", "").Trim() } | Where-Object { $_ -ne "" })
|
||||
if (Test-Path $ignoreLinksFile) {
|
||||
$ignoreLinks = (Get-Content $ignoreLinksFile).Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
|
||||
}
|
||||
|
||||
$checkedPages = @{};
|
||||
$checkedLinks = @{};
|
||||
$badLinks = @{};
|
||||
# Use default hashtable constructor instead of @{} because we need them to be case sensitive
|
||||
$checkedPages = New-Object Hashtable
|
||||
$checkedLinks = New-Object Hashtable
|
||||
|
||||
if ($inputCacheFile)
|
||||
{
|
||||
$cacheContent = ""
|
||||
if ($inputCacheFile.StartsWith("http")) {
|
||||
try {
|
||||
$response = Invoke-WebRequest -Uri $inputCacheFile
|
||||
$cacheContent = $response.Content
|
||||
}
|
||||
catch {
|
||||
$statusCode = $_.Exception.Response.StatusCode.value__
|
||||
Write-Error "Failed to read cache file from page [$statusCode] $inputCacheFile"
|
||||
}
|
||||
}
|
||||
elseif (Test-Path $inputCacheFile) {
|
||||
$cacheContent = Get-Content $inputCacheFile -Raw
|
||||
}
|
||||
$goodLinks = $cacheContent.Split("`n").Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
|
||||
|
||||
foreach ($goodLink in $goodLinks) {
|
||||
$checkedLinks[$goodLink] = $true
|
||||
}
|
||||
}
|
||||
|
||||
$cachedLinksCount = $checkedLinks.Count
|
||||
|
||||
if ($cachedLinksCount) {
|
||||
Write-Host "Skipping checks on $cachedLinksCount links found in the given cache of known good links."
|
||||
}
|
||||
|
||||
$badLinks = New-Object Hashtable
|
||||
$pageUrisToCheck = new-object System.Collections.Queue
|
||||
foreach ($url in $urls) {
|
||||
$uri = NormalizeUrl $url
|
||||
$uri = NormalizeUrl $url
|
||||
$pageUrisToCheck.Enqueue($uri);
|
||||
}
|
||||
|
||||
@ -359,8 +429,7 @@ while ($pageUrisToCheck.Count -ne 0)
|
||||
Write-Host "Found $($linkUris.Count) links on page $pageUri";
|
||||
$badLinksPerPage = @();
|
||||
foreach ($linkUri in $linkUris) {
|
||||
$replacedLink = ReplaceGithubLink $linkUri
|
||||
$isLinkValid = CheckLink $replacedLink
|
||||
$isLinkValid = CheckLink $linkUri
|
||||
if (!$isLinkValid -and !$badLinksPerPage.Contains($linkUri)) {
|
||||
if (!$linkUri.ToString().Trim()) {
|
||||
$linkUri = $emptyLinkMessage
|
||||
@ -388,10 +457,21 @@ foreach ($pageLink in $badLinks.Keys) {
|
||||
}
|
||||
}
|
||||
|
||||
$linksChecked = $checkedLinks.Count - $cachedLinksCount
|
||||
|
||||
if ($badLinks.Count -gt 0) {
|
||||
LogError "Found $($checkedLinks.Count) links with $($badLinks.Count) page(s) broken."
|
||||
}
|
||||
else {
|
||||
Write-Host "Found $($checkedLinks.Count) links. No broken links found."
|
||||
LogError "Checked $linksChecked links with $($badLinks.Count) page(s) broken."
|
||||
}
|
||||
else {
|
||||
Write-Host "Checked $linksChecked links. No broken links found."
|
||||
}
|
||||
|
||||
if ($outputCacheFile)
|
||||
{
|
||||
$goodLinks = $checkedLinks.Keys.Where({ "True" -eq $checkedLinks[$_].ToString() }) | Sort-Object
|
||||
|
||||
Write-Host "Writing the list of validated links to $outputCacheFile"
|
||||
$goodLinks | Set-Content $outputCacheFile
|
||||
}
|
||||
|
||||
exit $badLinks.Count
|
||||
|
||||
Loading…
Reference in New Issue
Block a user