Skip to content

Commit

Permalink
Merge pull request #8 from merlincox/master
Browse files Browse the repository at this point in the history
add support for numeric HTML entities and Unix style line breaks
  • Loading branch information
k3a authored Jul 14, 2019
2 parents eef0eb5 + a307896 commit 9556150
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 6 deletions.
39 changes: 33 additions & 6 deletions html2text.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,47 @@ import (
"bytes"
"regexp"
"strings"
"strconv"
)

// Line-break sequences that can be written to the text output.
// The ALL_CAPS names are kept because they are part of the exported API.
const (
	// WIN_LBR is the Windows-style line break (CR LF).
	WIN_LBR = "\r\n"
	// UNIX_LBR is the Unix-style line break (LF).
	UNIX_LBR = "\n"
)

var (
	// lbr is the line break currently in effect; it starts out as WIN_LBR
	// and is switched by SetUnixLbr.
	lbr = WIN_LBR

	// badTagnamesRE matches the text of a head, script, style or a tag: the
	// tag name must be followed by the end of the string or by whitespace
	// (i.e. attributes). The separator is `\s+` rather than `\s*` because
	// `\s*` also matches the empty string, which made the pattern accept
	// every tag that merely starts with one of these names ("header",
	// "abbr", "article", "address", ...).
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

	// linkTagRE extracts a quoted href attribute value; the value is in
	// submatch 2 for single quotes and submatch 3 for double quotes.
	linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)

	// badLinkHrefRE matches href values containing '#' or "javascript:".
	// NOTE(review): presumably used to keep such links out of the output;
	// confirm against the caller.
	badLinkHrefRE = regexp.MustCompile(`#|javascript:`)

	// headersRE matches the text of an opening or closing h1..h6 tag.
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches a decimal numeric entity name such as "#39"
	// and captures the digits in submatch 1.
	numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)
)

// parseHTMLEntity resolves an entity name (the text between '&' and ';')
// to the text it stands for.
//
// Names found in the entity table are returned as-is. Otherwise entName
// is tried as a decimal numeric reference such as "#39": of the code
// points below U+0020 only tab, line feed and carriage return are
// accepted, and values that are not Unicode scalar values (surrogates or
// anything above U+10FFFF) decode to U+FFFD.
//
// The boolean result reports whether entName was recognized; when it is
// false the returned string is empty.
func parseHTMLEntity(entName string) (string, bool) {
	if r, ok := entity[entName]; ok {
		return string(r), true
	}

	match := numericEntityRE.FindStringSubmatch(entName)
	if len(match) != 2 {
		return "", false
	}

	n, err := strconv.Atoi(match[1])
	if err != nil {
		// More digits than fit in an int: not a usable reference.
		return "", false
	}

	switch {
	case n == 9, n == 10, n == 13:
		// Tab, LF and CR are the only control characters let through.
		return string(rune(n)), true
	case n < 32:
		return "", false
	case n > 0x10FFFF, n >= 0xD800 && n <= 0xDFFF:
		// Must be checked before converting: rune(n) keeps only the low
		// 32 bits of n, so a huge value could otherwise wrap around to an
		// unrelated valid character.
		return "\uFFFD", true
	}

	return string(rune(n)), true
}

// SetUnixLbr selects the line-break sequence used in the output.
// Passing true selects Unix-style breaks ("\n"); passing false selects
// Windows-style breaks ("\r\n"), which is the default.
func SetUnixLbr(b bool) {
	if !b {
		lbr = WIN_LBR
		return
	}
	lbr = UNIX_LBR
}

// HTMLEntitiesToText decodes HTML entities inside a provided
// string and returns decoded text
func HTMLEntitiesToText(htmlEntsText string) string {
Expand Down Expand Up @@ -96,7 +123,7 @@ func HTML2Text(html string) string {
switch {
// skip new lines and spaces adding a single space if not there yet
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
writeSpace(outBuf)
continue

Expand Down Expand Up @@ -144,20 +171,20 @@ func HTML2Text(html string) string {
tagName := strings.ToLower(html[tagStart:i])

if tagName == "/ul" {
outBuf.WriteString("\r\n")
outBuf.WriteString(lbr)
} else if tagName == "li" || tagName == "li/" {
outBuf.WriteString("\r\n")
outBuf.WriteString(lbr)
} else if headersRE.MatchString(tagName) {
if canPrintNewline {
outBuf.WriteString("\r\n\r\n")
outBuf.WriteString(lbr + lbr)
}
canPrintNewline = false
} else if tagName == "br" || tagName == "br/" {
// new line
outBuf.WriteString("\r\n")
outBuf.WriteString(lbr)
} else if tagName == "p" || tagName == "/p" {
if canPrintNewline {
outBuf.WriteString("\r\n\r\n")
outBuf.WriteString(lbr + lbr)
}
canPrintNewline = false
} else if badTagnamesRE.MatchString(tagName) {
Expand Down
14 changes: 14 additions & 0 deletions html2text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,25 @@ func TestHTML2Text(t *testing.T) {
So(HTMLEntitiesToText("&abcdefghij;"), ShouldEqual, "&abcdefghij;")
})

Convey("Numeric HTML Entities", func() {
So(HTMLEntitiesToText("&#39;single quotes&#39; and &#52765;"), ShouldEqual, "'single quotes' and 츝")
})

Convey("Full HTML structure", func() {
So(HTML2Text(``), ShouldEqual, "")
So(HTML2Text(`<html><head><title>Good</title></head><body>x</body>`), ShouldEqual, "x")
So(HTML2Text(`we are not <script type="javascript"></script>interested in scripts`),
ShouldEqual, "we are not interested in scripts")
})

Convey("Switching Unix and Windows line breaks", func() {
SetUnixLbr(true)
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\nline\nbreaks")
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\n\nparagraphs")
SetUnixLbr(false)
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\r\nline\r\nbreaks")
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
})

})
}

0 comments on commit 9556150

Please sign in to comment.