From 0374ce317079ff1d27702d7b9b6e7061fa86d827 Mon Sep 17 00:00:00 2001 From: Bevan Arps Date: Wed, 24 May 2023 09:33:58 +1200 Subject: [PATCH 1/6] Add BaseURL configuration --- README.md | 1 + htmldoc/document_store.go | 2 ++ htmltest/options.go | 2 ++ 3 files changed, 5 insertions(+) diff --git a/README.md b/README.md index 150e6c7..771e9a4 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ htmltest uses a YAML configuration file. Put `.htmltest.yml` in the same directo | `DirectoryIndex` | The file to look for when linking to a directory. | `index.html` | | `FilePath` | Single file to test within `DirectoryPath`, omit to test all. | | | `FileExtension` | Extension of your HTML documents, includes the dot. If `FilePath` is set we use the extension from that. | `.html` | +| `BaseURL` | Publication URL of the site, including subfolder if applicable. | | | `CheckDoctype` | Enables checking the document type declaration. | `true` | | `CheckAnchors` | Enables checking ` Date: Wed, 24 May 2023 09:36:19 +1200 Subject: [PATCH 2/6] Resolve filepath relative to BaseURL subfolder if present --- htmldoc/document_store.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/htmldoc/document_store.go b/htmldoc/document_store.go index 6987c70..59533c4 100644 --- a/htmldoc/document_store.go +++ b/htmldoc/document_store.go @@ -8,6 +8,7 @@ import ( "os" "path" "regexp" + "strings" "github.com/wjdp/htmltest/output" ) @@ -106,7 +107,17 @@ func (dS *DocumentStore) ResolvePath(refPath string) (*Document, bool) { if refPath[0] == '/' && len(refPath) > 1 { // Is an absolute link, remove the leading slash for map lookup - refPath = refPath[1:] + if dS.BaseURL == nil { + // No base URL, so `/` means our root + refPath = refPath[1:] + } else { + // We have a Base URL, so need to strip off the base path if present + refPath = strings.TrimPrefix(refPath, dS.BaseURL.Path) + + // We want to end up with a relative path, so remove leading '/' if present + // 
(This happens if BaseURL does *not* end in '/') + refPath = strings.TrimPrefix(refPath, "/") + } } // Try path as-is, path.ext From bf9cb5a558bf307842444e9ddef1bd9e34567c60 Mon Sep 17 00:00:00 2001 From: Bevan Arps Date: Wed, 24 May 2023 10:04:40 +1200 Subject: [PATCH 3/6] Add tests for subfolder paths --- htmltest/check-link_test.go | 27 +++++++++++++++++++ .../absoluteBrokenLinksFolderPublication.html | 2 ++ htmltest/fixtures/links/absoluteLinks.html | 2 ++ .../links/absoluteLinksFolderPublication.html | 2 ++ .../links/absoluteLinksRootPublication.html | 2 ++ 5 files changed, 35 insertions(+) create mode 100644 htmltest/fixtures/links/absoluteBrokenLinksFolderPublication.html create mode 100644 htmltest/fixtures/links/absoluteLinks.html create mode 100644 htmltest/fixtures/links/absoluteLinksFolderPublication.html create mode 100644 htmltest/fixtures/links/absoluteLinksRootPublication.html diff --git a/htmltest/check-link_test.go b/htmltest/check-link_test.go index 3322b02..9a7b944 100644 --- a/htmltest/check-link_test.go +++ b/htmltest/check-link_test.go @@ -767,6 +767,33 @@ func TestAnchorBlankHTML4(t *testing.T) { tExpectIssueCount(t, hT2, 1) } +func TestAnchorInternalAbsolute(t *testing.T) { + // works for internal absolute links + hT := tTestFile("fixtures/links/absoluteLinks.html") + tExpectIssueCount(t, hT, 0) +} + +func TestAnchorInternalAbsoluteRootPublication(t *testing.T) { + // works for internal absolute links when site is published to root of domain + hT := tTestFileOpts("fixtures/links/absoluteLinksRootPublication.html", + map[string]interface{}{"BaseURL": "http://example.com"}) + tExpectIssueCount(t, hT, 0) +} + +func TestAnchorInternalAbsoluteFolderPublication(t *testing.T) { + // works for internal absolute links when site is published to a folder + hT := tTestFileOpts("fixtures/links/absoluteLinksFolderPublication.html", + map[string]interface{}{"BaseURL": "http://www.example.com/blog"}) + tExpectIssueCount(t, hT, 0) +} + +func 
TestAnchorInternalBrokenAbsoluteFolderPublication(t *testing.T) { + // works for missing internal absolute links when site is published to a folder + hT := tTestFileOpts("fixtures/links/absoluteBrokenLinksFolderPublication.html", + map[string]interface{}{"BaseURL": "http://www.example.com/blog"}) + tExpectIssueCount(t, hT, 2) +} + func TestSelfSignedLink(t *testing.T) { tSkipShortExternal(t) hT := tTestFileOpts("fixtures/links/selfSignedLink.html", diff --git a/htmltest/fixtures/links/absoluteBrokenLinksFolderPublication.html b/htmltest/fixtures/links/absoluteBrokenLinksFolderPublication.html new file mode 100644 index 0000000..542bb7f --- /dev/null +++ b/htmltest/fixtures/links/absoluteBrokenLinksFolderPublication.html @@ -0,0 +1,2 @@ +Missing file in root of site +Missing file in folder of site \ No newline at end of file diff --git a/htmltest/fixtures/links/absoluteLinks.html b/htmltest/fixtures/links/absoluteLinks.html new file mode 100644 index 0000000..bd32420 --- /dev/null +++ b/htmltest/fixtures/links/absoluteLinks.html @@ -0,0 +1,2 @@ +Relative to root +Also relative to root \ No newline at end of file diff --git a/htmltest/fixtures/links/absoluteLinksFolderPublication.html b/htmltest/fixtures/links/absoluteLinksFolderPublication.html new file mode 100644 index 0000000..25aba9e --- /dev/null +++ b/htmltest/fixtures/links/absoluteLinksFolderPublication.html @@ -0,0 +1,2 @@ +Relative to root +Also relative to root \ No newline at end of file diff --git a/htmltest/fixtures/links/absoluteLinksRootPublication.html b/htmltest/fixtures/links/absoluteLinksRootPublication.html new file mode 100644 index 0000000..bd32420 --- /dev/null +++ b/htmltest/fixtures/links/absoluteLinksRootPublication.html @@ -0,0 +1,2 @@ +Relative to root +Also relative to root \ No newline at end of file From e539141b9fa6dd9cd361e81d8f9529a710410156 Mon Sep 17 00:00:00 2001 From: Bevan Arps Date: Wed, 24 May 2023 10:18:00 +1200 Subject: [PATCH 4/6] Enable checking of external links to self 
--- README.md | 1 + htmltest/check-link.go | 18 ++++++++++++++++++ htmltest/check-link_test.go | 10 ++++++++++ .../links/externalLinksToInternalFiles.html | 2 ++ htmltest/options.go | 15 ++++++++------- 5 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 htmltest/fixtures/links/externalLinksToInternalFiles.html diff --git a/README.md b/README.md index 771e9a4..c3444a9 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,7 @@ htmltest uses a YAML configuration file. Put `.htmltest.yml` in the same directo | `CheckTel` | Enables–albeit quite basic–`tel:` link checking. | `true` | | `CheckFavicon` | Enables favicon checking, ensures every page has a favicon set. | `false` | | `CheckMetaRefresh` | Enables checking meta refresh tags. | `true` | +| `CheckSelfReferencesAsInternal` | Check external references starting with `BaseURL` as though they are internal references. | `false` | | `EnforceHTML5` | Fails when the doctype isn't ``. | `false` | | `EnforceHTTPS` | Fails when encountering an `http://` link. Useful to prevent mixed content errors when serving over HTTPS. | `false` | | `IgnoreURLs` | Array of regexs of URLs to ignore. | empty | diff --git a/htmltest/check-link.go b/htmltest/check-link.go index aacdaa1..75a660f 100644 --- a/htmltest/check-link.go +++ b/htmltest/check-link.go @@ -139,6 +139,24 @@ func (hT *HTMLTest) checkExternal(ref *htmldoc.Reference) { return } + // Is this an external reference to a local file? + if hT.opts.CheckSelfReferencesAsInternal && hT.documentStore.BaseURL != nil { + + if ref.URL.Host == hT.documentStore.BaseURL.Host && hT.documentStore.BaseURL.User == nil { + // Convert to internal reference + internalURL := *ref.URL + internalURL.Scheme = "" + internalURL.Host = "" + + internalRef := *ref + internalRef.URL = &internalURL + internalRef.Path = internalURL.String() + + hT.checkInternal(&internalRef) + return + } + } + urlStr := ref.URLString() // Does this url match an url ignore rule? 
diff --git a/htmltest/check-link_test.go b/htmltest/check-link_test.go index 9a7b944..45423f4 100644 --- a/htmltest/check-link_test.go +++ b/htmltest/check-link_test.go @@ -794,6 +794,16 @@ func TestAnchorInternalBrokenAbsoluteFolderPublication(t *testing.T) { tExpectIssueCount(t, hT, 2) } +func TestAnchorExternalLinksToInternalFiles(t *testing.T) { + // works for external links that reference internal files + hT := tTestFileOpts("fixtures/links/externalLinksToInternalFiles.html", + map[string]interface{}{ + "BaseURL": "http://www.example.com/blog", + "CheckSelfReferencesAsInternal": true, + }) + tExpectIssueCount(t, hT, 0) +} + func TestSelfSignedLink(t *testing.T) { tSkipShortExternal(t) hT := tTestFileOpts("fixtures/links/selfSignedLink.html", diff --git a/htmltest/fixtures/links/externalLinksToInternalFiles.html b/htmltest/fixtures/links/externalLinksToInternalFiles.html new file mode 100644 index 0000000..abe3ea6 --- /dev/null +++ b/htmltest/fixtures/links/externalLinksToInternalFiles.html @@ -0,0 +1,2 @@ +External HTTP reference +External HTTPS reference \ No newline at end of file diff --git a/htmltest/options.go b/htmltest/options.go index 2e24c8c..3e5dd3b 100644 --- a/htmltest/options.go +++ b/htmltest/options.go @@ -29,13 +29,14 @@ type Options struct { CheckMeta bool CheckGeneric bool - CheckExternal bool - CheckInternal bool - CheckInternalHash bool - CheckMailto bool - CheckTel bool - CheckFavicon bool - CheckMetaRefresh bool + CheckExternal bool + CheckInternal bool + CheckInternalHash bool + CheckMailto bool + CheckTel bool + CheckFavicon bool + CheckMetaRefresh bool + CheckSelfReferencesAsInternal bool EnforceHTML5 bool EnforceHTTPS bool From eae639897ba65930e2e6a31c20934535eac88874 Mon Sep 17 00:00:00 2001 From: Bevan Arps Date: Wed, 24 May 2023 10:18:17 +1200 Subject: [PATCH 5/6] Fixup BaseURL --- htmltest/htmltest.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/htmltest/htmltest.go b/htmltest/htmltest.go index e30bc62..3cec597 
100644 --- a/htmltest/htmltest.go +++ b/htmltest/htmltest.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "net/http" + "net/url" "os" "path" "strings" @@ -149,6 +150,17 @@ func Test(optsUser map[string]interface{}) (*HTMLTest, error) { hT.documentStore.DirectoryIndex = hT.opts.DirectoryIndex hT.documentStore.IgnorePatterns = hT.opts.IgnoreDirs hT.documentStore.IgnoreTagAttribute = hT.opts.IgnoreTagAttribute + + if hT.opts.BaseURL != "" { + baseURL, err := url.Parse(hT.opts.BaseURL) + if err != nil { + err := fmt.Errorf("Could not parse BaseURL '%s': %w", hT.opts.BaseURL, err) + return &hT, err + } + + hT.documentStore.BaseURL = baseURL + } + // Discover documents hT.documentStore.Discover() From 88e3d44e17d5b25a18eed4f36c773c209b3e9e5c Mon Sep 17 00:00:00 2001 From: Bevan Arps Date: Tue, 30 May 2023 14:35:39 +1200 Subject: [PATCH 6/6] Fix external->internal bug --- htmltest/check-link.go | 9 ++++++--- htmltest/check-link_test.go | 10 ++++++++++ .../fixtures/links/externalLinksToExternalPages.html | 2 ++ 3 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 htmltest/fixtures/links/externalLinksToExternalPages.html diff --git a/htmltest/check-link.go b/htmltest/check-link.go index 75a660f..5f20122 100644 --- a/htmltest/check-link.go +++ b/htmltest/check-link.go @@ -17,8 +17,9 @@ import ( ) // ignoredRels: List of rel values to ignore, dns-prefetch and preconnect are ignored as they are not links to be -// followed rather telling browser we want something on that host, if the root of that host is not valid, -// it's likely not a problem. +// +// followed rather telling browser we want something on that host, if the root of that host is not valid, +// it's likely not a problem. var ignoredRels = [...]string{"dns-prefetch", "preconnect"} func (hT *HTMLTest) checkLink(document *htmldoc.Document, node *html.Node) { @@ -142,7 +143,9 @@ func (hT *HTMLTest) checkExternal(ref *htmldoc.Reference) { // Is this an external reference to a local file? 
if hT.opts.CheckSelfReferencesAsInternal && hT.documentStore.BaseURL != nil { - if ref.URL.Host == hT.documentStore.BaseURL.Host && hT.documentStore.BaseURL.User == nil { + if ref.URL.Host == hT.documentStore.BaseURL.Host && + hT.documentStore.BaseURL.User == nil && + strings.HasPrefix(ref.URL.Path, hT.documentStore.BaseURL.Path) { // Convert to internal reference internalURL := *ref.URL internalURL.Scheme = "" diff --git a/htmltest/check-link_test.go b/htmltest/check-link_test.go index 45423f4..82ce2d5 100644 --- a/htmltest/check-link_test.go +++ b/htmltest/check-link_test.go @@ -804,6 +804,16 @@ func TestAnchorExternalLinksToInternalFiles(t *testing.T) { tExpectIssueCount(t, hT, 0) } +func TestAnchorExternalLinksToExternalPages(t *testing.T) { + // works for external links that don't reference internal files because the subfolder is different + hT := tTestFileOpts("fixtures/links/externalLinksToExternalPages.html", + map[string]interface{}{ + "BaseURL": "http://www.example.com/blog", + "CheckSelfReferencesAsInternal": true, + }) + tExpectIssueCount(t, hT, 2) +} + func TestSelfSignedLink(t *testing.T) { tSkipShortExternal(t) hT := tTestFileOpts("fixtures/links/selfSignedLink.html", diff --git a/htmltest/fixtures/links/externalLinksToExternalPages.html b/htmltest/fixtures/links/externalLinksToExternalPages.html new file mode 100644 index 0000000..b1d90f5 --- /dev/null +++ b/htmltest/fixtures/links/externalLinksToExternalPages.html @@ -0,0 +1,2 @@ +Path 'folder' does not match, should not be mapped to internal file +Path 'issues' does not match, should not map to known file \ No newline at end of file