switch to html2text() instead of strip_tags() when preparing FTS index

This commit is contained in:
Andrew Dolgov 2023-10-21 10:51:24 +03:00
parent 2b61052e87
commit 03e956132d
No known key found for this signature in database
GPG Key ID: 1A56B4FA25D4AF2A
73 changed files with 27833 additions and 17 deletions

View File

@ -90,7 +90,7 @@ class Article extends Handler_Protected {
SET tsvector_combined = to_tsvector( :ts_content)
WHERE id = :id");
$params = [
":ts_content" => mb_substr(strip_tags($content ), 0, 900000),
":ts_content" => mb_substr(\Soundasleep\Html2Text::convert($content), 0, 900000),
":id" => $ref_id];
$sth->execute($params);
}
@ -135,7 +135,7 @@ class Article extends Handler_Protected {
SET tsvector_combined = to_tsvector( :ts_content)
WHERE id = :id");
$params = [
":ts_content" => mb_substr(strip_tags($content ), 0, 900000),
":ts_content" => mb_substr(\Soundasleep\Html2Text::convert($content), 0, 900000),
":id" => $ref_id];
$sth->execute($params);
}

View File

@ -1184,7 +1184,7 @@ class RSSUtils {
if (Config::get(Config::DB_TYPE) == "pgsql") {
$params[":ts_lang"] = $feed_language;
$params[":ts_content"] = mb_substr(strip_tags($entry_title . " " . $entry_content), 0, 900000);
$params[":ts_content"] = mb_substr(strip_tags($entry_title) . " " . \Soundasleep\Html2Text::convert($entry_content), 0, 900000);
}
$sth->execute($params);

View File

@ -18,7 +18,8 @@
"mervick/material-design-icons": "^2.2",
"j4mie/idiorm": "dev-master",
"open-telemetry/exporter-otlp": "^1.0",
"php-http/guzzle7-adapter": "^1.0"
"php-http/guzzle7-adapter": "^1.0",
"soundasleep/html2text": "^2.1"
},
"require-dev": {
"phpstan/phpstan": "1.10.3",

57
composer.lock generated
View File

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "2c8b76f35398131c362d125ed47c8102",
"content-hash": "cbbbfbdbf1c5f659b8e34307411bc751",
"packages": [
{
"name": "beberlei/assert",
@ -1659,6 +1659,61 @@
},
"time": "2019-03-08T08:55:37+00:00"
},
{
"name": "soundasleep/html2text",
"version": "2.1.0",
"source": {
"type": "git",
"url": "https://github.com/soundasleep/html2text.git",
"reference": "83502b6f8f1aaef8e2e238897199d64f284b4af3"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/soundasleep/html2text/zipball/83502b6f8f1aaef8e2e238897199d64f284b4af3",
"reference": "83502b6f8f1aaef8e2e238897199d64f284b4af3",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-libxml": "*",
"php": "^7.3|^8.0"
},
"require-dev": {
"phpstan/phpstan": "^1.9",
"phpunit/phpunit": "^7.0|^8.0|^9.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Soundasleep\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Jevon Wright",
"homepage": "https://jevon.org",
"role": "Developer"
}
],
"description": "A PHP script to convert HTML into a plain text format",
"homepage": "https://github.com/soundasleep/html2text",
"keywords": [
"email",
"html",
"php",
"text"
],
"support": {
"email": "support@jevon.org",
"issues": "https://github.com/soundasleep/html2text/issues",
"source": "https://github.com/soundasleep/html2text/tree/2.1.0"
},
"time": "2023-01-06T09:28:15+00:00"
},
{
"name": "spomky-labs/otphp",
"version": "v10.0.3",

View File

@ -317,7 +317,7 @@
while (true) {
foreach ($entries as $entry) {
$tsvector_combined = mb_substr(strip_tags($entry->title . " " . $entry->content), 0, 1000000);
$tsvector_combined = mb_substr(strip_tags($entry->title) . " " . \Soundasleep\Html2Text::convert($entry->content), 0, 900000);
$usth->execute([$tsvector_combined, $entry->id]);
$processed++;
}

View File

@ -14,8 +14,9 @@ return array(
'Symfony\\Polyfill\\Php81\\' => array($vendorDir . '/symfony/polyfill-php81'),
'Symfony\\Polyfill\\Php80\\' => array($vendorDir . '/symfony/polyfill-php80'),
'Symfony\\Polyfill\\Mbstring\\' => array($vendorDir . '/symfony/polyfill-mbstring'),
'Soundasleep\\' => array($vendorDir . '/soundasleep/html2text/src'),
'Psr\\Log\\' => array($vendorDir . '/psr/log/src'),
'Psr\\Http\\Message\\' => array($vendorDir . '/psr/http-message/src', $vendorDir . '/psr/http-factory/src'),
'Psr\\Http\\Message\\' => array($vendorDir . '/psr/http-factory/src', $vendorDir . '/psr/http-message/src'),
'Psr\\Http\\Client\\' => array($vendorDir . '/psr/http-client/src'),
'Prophecy\\' => array($vendorDir . '/phpspec/prophecy/src/Prophecy'),
'PhpParser\\' => array($vendorDir . '/nikic/php-parser/lib/PhpParser'),

View File

@ -137,6 +137,7 @@ class ComposerStaticInit19fc2ff1c0f9a92279c7979386bb2056
'Symfony\\Polyfill\\Php81\\' => 23,
'Symfony\\Polyfill\\Php80\\' => 23,
'Symfony\\Polyfill\\Mbstring\\' => 26,
'Soundasleep\\' => 12,
),
'P' =>
array (
@ -219,14 +220,18 @@ class ComposerStaticInit19fc2ff1c0f9a92279c7979386bb2056
array (
0 => __DIR__ . '/..' . '/symfony/polyfill-mbstring',
),
'Soundasleep\\' =>
array (
0 => __DIR__ . '/..' . '/soundasleep/html2text/src',
),
'Psr\\Log\\' =>
array (
0 => __DIR__ . '/..' . '/psr/log/src',
),
'Psr\\Http\\Message\\' =>
array (
0 => __DIR__ . '/..' . '/psr/http-message/src',
1 => __DIR__ . '/..' . '/psr/http-factory/src',
0 => __DIR__ . '/..' . '/psr/http-factory/src',
1 => __DIR__ . '/..' . '/psr/http-message/src',
),
'Psr\\Http\\Client\\' =>
array (

View File

@ -3791,6 +3791,64 @@
],
"install-path": "../sebastian/version"
},
{
"name": "soundasleep/html2text",
"version": "2.1.0",
"version_normalized": "2.1.0.0",
"source": {
"type": "git",
"url": "https://github.com/soundasleep/html2text.git",
"reference": "83502b6f8f1aaef8e2e238897199d64f284b4af3"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/soundasleep/html2text/zipball/83502b6f8f1aaef8e2e238897199d64f284b4af3",
"reference": "83502b6f8f1aaef8e2e238897199d64f284b4af3",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-libxml": "*",
"php": "^7.3|^8.0"
},
"require-dev": {
"phpstan/phpstan": "^1.9",
"phpunit/phpunit": "^7.0|^8.0|^9.0"
},
"time": "2023-01-06T09:28:15+00:00",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-4": {
"Soundasleep\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Jevon Wright",
"homepage": "https://jevon.org",
"role": "Developer"
}
],
"description": "A PHP script to convert HTML into a plain text format",
"homepage": "https://github.com/soundasleep/html2text",
"keywords": [
"email",
"html",
"php",
"text"
],
"support": {
"email": "support@jevon.org",
"issues": "https://github.com/soundasleep/html2text/issues",
"source": "https://github.com/soundasleep/html2text/tree/2.1.0"
},
"install-path": "../soundasleep/html2text"
},
{
"name": "spomky-labs/otphp",
"version": "v10.0.3",

View File

@ -3,7 +3,7 @@
'name' => '__root__',
'pretty_version' => 'dev-master',
'version' => 'dev-master',
'reference' => '45a9ff0c88cbd33892ff16ab837e9059937d656e',
'reference' => '2b61052e8709283d89997e351173bcb43a3c2c61',
'type' => 'library',
'install_path' => __DIR__ . '/../../',
'aliases' => array(),
@ -13,7 +13,7 @@
'__root__' => array(
'pretty_version' => 'dev-master',
'version' => 'dev-master',
'reference' => '45a9ff0c88cbd33892ff16ab837e9059937d656e',
'reference' => '2b61052e8709283d89997e351173bcb43a3c2c61',
'type' => 'library',
'install_path' => __DIR__ . '/../../',
'aliases' => array(),
@ -371,8 +371,8 @@
'psr/http-client-implementation' => array(
'dev_requirement' => false,
'provided' => array(
0 => '*',
1 => '1.0',
0 => '1.0',
1 => '*',
),
),
'psr/http-factory' => array(
@ -387,8 +387,8 @@
'psr/http-factory-implementation' => array(
'dev_requirement' => false,
'provided' => array(
0 => '*',
1 => '1.0',
0 => '1.0',
1 => '*',
),
),
'psr/http-message' => array(
@ -403,8 +403,8 @@
'psr/http-message-implementation' => array(
'dev_requirement' => false,
'provided' => array(
0 => '*',
1 => '1.0',
0 => '1.0',
1 => '*',
),
),
'psr/log' => array(
@ -569,6 +569,15 @@
'aliases' => array(),
'dev_requirement' => true,
),
'soundasleep/html2text' => array(
'pretty_version' => '2.1.0',
'version' => '2.1.0.0',
'reference' => '83502b6f8f1aaef8e2e238897199d64f284b4af3',
'type' => 'library',
'install_path' => __DIR__ . '/../soundasleep/html2text',
'aliases' => array(),
'dev_requirement' => false,
),
'spomky-labs/otphp' => array(
'pretty_version' => 'v10.0.3',
'version' => '10.0.3.0',

View File

@ -0,0 +1,23 @@
# EditorConfig is awesome: http://EditorConfig.org
# top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
charset = utf-8
insert_final_newline = true
trim_trailing_whitespace = true
indent_style = tab
indent_size = 4
[*.md]
indent_style = space
indent_size = 2
# don't add newlines to test files
[tests/*]
indent_style = tab
trim_trailing_whitespace = false
insert_final_newline = false

View File

@ -0,0 +1,17 @@
name: Lint
on:
- push
jobs:
lint:
name: Lint
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Setup PHP
uses: shivammathur/setup-php@v2
with:
php-version: '7.4'
tools: phplint
- name: Check syntax
run: phplint .

View File

@ -0,0 +1,41 @@
name: Test
on:
- push
jobs:
test:
strategy:
matrix:
operating-system:
- ubuntu-latest
php-version:
- '7.3'
- '7.4'
- '8.0'
- '8.1'
- '8.2'
name: php ${{ matrix.php-version }} on ${{ matrix.operating-system }}
runs-on: ${{ matrix.operating-system }}
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Setup PHP
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php-version }}
extensions: mbstring
coverage: none
- name: Get composer cache directory
id: composer-cache
run: echo "::set-output name=dir::$(composer config cache-files-dir)"
- name: Setup composer cache
uses: actions/cache@v3
with:
path: ${{ steps.composer-cache.outputs.dir }}
key: ${{ runner.os }}-composer-${{ hashFiles('**/composer.lock') }}
restore-keys: ${{ runner.os }}-composer-
- name: Install composer dependencies
env:
COMPOSER_AUTH: ${{ secrets.COMPOSER_AUTH }}
run: composer install --no-ansi --no-interaction --no-scripts --no-progress --prefer-dist
- name: Run tests
run: vendor/bin/phpunit

View File

@ -0,0 +1,7 @@
tests/*.output
*.sublime-project
*.sublime-workspace
vendor/
**/*.DS_Store
.phpunit.result.cache
composer.lock

View File

@ -0,0 +1,37 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [1.1.0] - 2019-02-15
### Added
- Zero-width non-joiners are now stripped to prevent output issues, similar to non-breaking whitespace
### Fixed
- Fix namespace in composer [#67](https://github.com/soundasleep/html2text/pull/67)
## [1.0.0] - 2019-02-14
### Added
- Added `drop_links` option to render links without the target href [#65](https://github.com/soundasleep/html2text/pull/65)
### Changed
- **Important:** Changed namespace from `\Html2Text\Html2Text` to `\Soundasleep\Html2Text` [#45](https://github.com/soundasleep/html2text/issues/45)
- Treat non-breaking spaces consistently: never include them in output text [#64](https://github.com/soundasleep/html2text/pull/64)
- Second argument to `convert()` is now an array, rather than boolean [#65](https://github.com/soundasleep/html2text/pull/65)
- Optimise/improve newline & whitespace handling [#47](https://github.com/soundasleep/html2text/pull/47)
- Upgrade PHP support to PHP 7.3+
- Upgrade PHPUnit to 7.x
- Re-release project under MIT license [#58](https://github.com/soundasleep/html2text/issues/58)
## [0.5.0] - 2017-04-20
### Added
- Add ignore_error optional argument [#63](https://github.com/soundasleep/html2text/pull/63)
- Blockquote support [#50](https://github.com/soundasleep/html2text/pull/50)
[Unreleased]: https://github.com/soundasleep/html2text/compare/1.1.0...HEAD
[1.1.0]: https://github.com/soundasleep/html2text/compare/1.0.0...1.1.0
[1.0.0]: https://github.com/soundasleep/html2text/compare/0.5.0...1.0.0
[0.5.0]: https://github.com/soundasleep/html2text/compare/0.3.4...0.5.0

21
vendor/soundasleep/html2text/LICENSE.md vendored Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 Jevon Wright
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

102
vendor/soundasleep/html2text/README.md vendored Normal file
View File

@ -0,0 +1,102 @@
![example workflow](https://github.com/soundasleep/html2text/actions/workflows/test.yml/badge.svg) [![Total Downloads](https://poser.pugx.org/soundasleep/html2text/downloads.png)](https://packagist.org/packages/soundasleep/html2text)
=========
html2text is a very simple script that uses DOM methods to convert HTML into a format similar to what would be
rendered by a browser - perfect for places where you need a quick text representation. For example:
```html
<html>
<title>Ignored Title</title>
<body>
<h1>Hello, World!</h1>
<p>This is some e-mail content.
Even though it has whitespace and newlines, the e-mail converter
will handle it correctly.
<p>Even mismatched tags.</p>
<div>A div</div>
<div>Another div</div>
<div>A div<div>within a div</div></div>
<a href="http://foo.com">A link</a>
</body>
</html>
```
Will be converted into:
```text
Hello, World!
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
Even mismatched tags.
A div
Another div
A div
within a div
[A link](http://foo.com)
```
See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
## Installing
You can use [Composer](http://getcomposer.org/) to add the [package](https://packagist.org/packages/soundasleep/html2text) to your project:
```json
{
"require": {
"soundasleep/html2text": "^2.1"
}
}
```
And then use it quite simply:
```php
$text = \Soundasleep\Html2Text::convert($html);
```
You can also include the supplied `html2text.php` and use `$text = convert_html_to_text($html);` instead.
### Options
| Option | Default | Description |
|--------|---------|-------------|
| **ignore_errors** | `false` | Set to `true` to ignore any XML parsing errors. |
| **drop_links** | `false` | Set to `true` to not render links as `[My Link](http://foo.com)`, but rather just `My Link`. |
| **char_set** | `'auto'` | Specify a specific character set. Pass multiple character sets (comma separated) to detect encoding, default is ASCII,UTF-8 |
Pass along options as a second argument to `convert`, for example:
```php
$options = array(
'ignore_errors' => true,
// other options go here
);
$text = \Soundasleep\Html2Text::convert($html, $options);
```
## Tests
Some very basic tests are provided in the `tests/` directory. Run them with `composer install && vendor/bin/phpunit`.
## Troubleshooting
### Class 'DOMDocument' not found
You need to [install the PHP XML extension](https://github.com/soundasleep/html2text/issues/55) for your PHP version. e.g. `apt-get install php7.4-xml`
## License
`html2text` is [licensed under MIT](LICENSE.md), making it suitable for both Eclipse and GPL projects.
## Other versions
Also see [html2text_ruby](https://github.com/soundasleep/html2text_ruby), a Ruby implementation.

View File

@ -0,0 +1,32 @@
{
"name": "soundasleep/html2text",
"description": "A PHP script to convert HTML into a plain text format",
"type": "library",
"keywords": [ "php", "html", "text", "email" ],
"homepage": "https://github.com/soundasleep/html2text",
"license": "MIT",
"authors": [
{
"name": "Jevon Wright",
"homepage": "https://jevon.org",
"role": "Developer"
}
],
"autoload": {
"psr-4": {
"Soundasleep\\": "src"
}
},
"support": {
"email": "support@jevon.org"
},
"require": {
"php": "^7.3|^8.0",
"ext-dom": "*",
"ext-libxml": "*"
},
"require-dev": {
"phpunit/phpunit": "^7.0|^8.0|^9.0",
"phpstan/phpstan": "^1.9"
}
}

View File

@ -0,0 +1,21 @@
<?php
/**
* This file allows you to convert through the command line.
* Usage:
* php -f convert.php [input file]
*/
// Command-line entry point: reads the HTML file named by the first argument
// and prints its plain-text conversion to standard output.

// $argv is only populated when PHP registers CLI arguments; treat a missing
// array the same as "no arguments given".
if (count($argv ?? []) < 2) {
	throw new \InvalidArgumentException("Expected: php -f convert.php [input file]");
}

$path = $argv[1];

if (!file_exists($path)) {
	throw new \InvalidArgumentException("'" . $path . "' does not exist");
}

// file_exists() also succeeds for directories and unreadable files, in which
// case file_get_contents() returns false. Fail loudly instead of silently
// converting an empty string.
$input = file_get_contents($path);
if ($input === false) {
	throw new \RuntimeException("Could not read '" . $path . "'");
}

require_once(__DIR__ . "/src/Html2Text.php");
require_once(__DIR__ . "/src/Html2TextException.php");

echo \Soundasleep\Html2Text::convert($input);

View File

@ -0,0 +1,16 @@
<?php
/**
* This file is available if you still want to use functions rather than
* autoloading classes.
*/
require_once(__DIR__ . "/src/Html2Text.php");
require_once(__DIR__ . "/src/Html2TextException.php");
/**
 * Procedural wrapper around Soundasleep\Html2Text::convert().
 *
 * @param string $html the input HTML
 * @param mixed $ignore_error forwarded unchanged as the second argument of convert()
 * @return string whatever convert() returns for the given input
 */
function convert_html_to_text($html, $ignore_error = false) {
	$text = \Soundasleep\Html2Text::convert($html, $ignore_error);

	return $text;
}
/**
 * Procedural wrapper around Soundasleep\Html2Text::fixNewlines().
 *
 * @param string $text text to pass through fixNewlines()
 * @return string whatever fixNewlines() returns for the given text
 */
function fix_newlines($text) {
	$fixed = \Soundasleep\Html2Text::fixNewlines($text);

	return $fixed;
}

View File

@ -0,0 +1,7 @@
parameters:
level: 6
errorFormat: raw
editorUrl: '%%file%% %%line%% %%column%%: %%error%%'
paths:
- src
- tests

View File

@ -0,0 +1,8 @@
<phpunit stopOnFailure="true" stopOnError="true" beStrictAboutTestsThatDoNotTestAnything="false">
<testsuites>
<testsuite name="Tests">
<!-- loads all *Test.php -->
<directory>tests</directory>
</testsuite>
</testsuites>
</phpunit>

View File

@ -0,0 +1,540 @@
<?php
namespace Soundasleep;
/**
 * Converts HTML into a plain text approximation of what a browser would
 * render. All functionality is exposed through static methods; the main
 * entry point is {@see Html2Text::convert()}.
 */
class Html2Text {

	/**
	 * Default value of every option understood by {@see convert()}.
	 * The keys of this array double as the list of valid option names.
	 *
	 * @return array<string, bool | string>
	 */
	public static function defaultOptions(): array {
		return [
			'ignore_errors' => false,
			'drop_links' => false,
			'char_set' => 'auto'
		];
	}

	/**
	 * Tries to convert the given HTML into a plain text format - best suited for
	 * e-mail display, etc.
	 *
	 * <p>In particular, it tries to maintain the following features:
	 * <ul>
	 *   <li>Links are maintained, with the 'href' copied over
	 *   <li>Information in the &lt;head&gt; is lost
	 * </ul>
	 *
	 * @param string $html the input HTML
	 * @param boolean|array<string, bool | string> $options if boolean, Ignore xml parsing errors, else ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto']
	 * @return string the HTML converted, as best as possible, to text
	 * @throws \InvalidArgumentException if $options contains a key that is not in {@see defaultOptions()}
	 * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
	 */
	public static function convert(string $html, $options = []): string {

		if (is_bool($options)) {
			// Using old style (< 1.0) of passing in options
			$options = ['ignore_errors' => $options];
		}

		$defaults = static::defaultOptions();
		$valid_keys = array_keys($defaults);
		$options = array_merge($defaults, $options);

		// check all options are valid; strict comparison so that integer keys
		// cannot loosely match an option name on PHP 7
		foreach (array_keys($options) as $key) {
			if (!in_array($key, $valid_keys, true)) {
				// list the option *names* (the values would render as ",,auto")
				throw new \InvalidArgumentException("Unknown html2text option '$key'. Valid options are " . implode(',', $valid_keys));
			}
		}

		$is_office_document = self::isOfficeDocument($html);

		if ($is_office_document) {
			// remove office namespace
			$html = str_replace(["<o:p>", "</o:p>"], "", $html);
		}

		$html = self::fixNewlines($html);

		// use mb_convert_encoding for legacy versions of php (< 8.1)
		if (PHP_VERSION_ID < 80100 && mb_detect_encoding($html, "UTF-8", true)) {
			$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
		}

		$doc = self::getDocument($html, $options);

		$output = self::iterateOverNode($doc, null, false, $is_office_document, $options);

		// process output for whitespace/newlines
		return self::processWhitespaceNewlines($output);
	}

	/**
	 * Unify newlines; in particular, \r\n becomes \n, and
	 * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
	 * all become \ns.
	 *
	 * @param string $text text with any number of \r, \r\n and \n combinations
	 * @return string the fixed text
	 */
	public static function fixNewlines(string $text): string {
		// replace \r\n to \n
		$text = str_replace("\r\n", "\n", $text);
		// replace any remaining lone \r with \n
		$text = str_replace("\r", "\n", $text);

		return $text;
	}

	/**
	 * Byte sequences that {@see renderText()} replaces with a regular space.
	 *
	 * @return array<string>
	 */
	public static function nbspCodes(): array {
		return [
			// UTF-8 encoding of U+00A0 (non-breaking space)
			"\xc2\xa0",
			// NOTE(review): PHP only recognises the braced form "\u{00a0}", so this
			// entry is the literal six-character text \u00a0, not the code point.
			// Kept byte-identical for backward compatibility.
			"\u00a0",
		];
	}

	/**
	 * Byte sequences that {@see renderText()} removes from the text.
	 *
	 * @return array<string>
	 */
	public static function zwnjCodes(): array {
		return [
			// UTF-8 encoding of U+200C (zero-width non-joiner)
			"\xe2\x80\x8c",
			// NOTE(review): literal six-character text \u200c, see nbspCodes()
			"\u200c",
		];
	}

	/**
	 * Remove leading or trailing spaces and excess empty lines from provided multiline text
	 *
	 * @param string $text multiline text any number of leading or trailing spaces or excess lines
	 * @return string the fixed text
	 */
	public static function processWhitespaceNewlines(string $text): string {

		// remove excess spaces around tabs
		$text = preg_replace("/ *\t */im", "\t", $text);

		// remove leading whitespace
		$text = ltrim($text);

		// remove leading spaces on each line
		$text = preg_replace("/\n[ \t]*/im", "\n", $text);

		// convert non-breaking spaces to regular spaces to prevent output issues,
		// do it here so they do NOT get removed with other leading spaces, as they
		// are sometimes used for indentation
		$text = self::renderText($text);

		// remove trailing whitespace
		$text = rtrim($text);

		// remove trailing spaces on each line
		$text = preg_replace("/[ \t]*\n/im", "\n", $text);

		// unarmor pre blocks: iterateOverNode() protects newlines inside <pre>
		// by writing them as \r, which are turned back into \n here
		$text = self::fixNewlines($text);

		// remove unnecessary empty lines
		$text = preg_replace("/\n\n\n*/im", "\n\n", $text);

		return $text;
	}

	/**
	 * Can we guess that this HTML is generated by Microsoft Office?
	 *
	 * @param string $html the input HTML
	 * @return bool true if the Office XML namespace URN occurs anywhere in $html
	 */
	public static function isOfficeDocument(string $html): bool {
		return strpos($html, "urn:schemas-microsoft-com:office") !== false;
	}

	/**
	 * Is the text empty once non-breaking spaces, zero-width non-joiners,
	 * spaces, tabs and newlines have been discarded?
	 *
	 * @param string $text the text to inspect
	 * @return bool true if nothing visible remains
	 */
	public static function isWhitespace(string $text): bool {
		return strlen(trim(self::renderText($text), "\n\r\t ")) === 0;
	}

	/**
	 * Parse HTML into a DOMDocument
	 *
	 * @param string $html the input HTML
	 * @param array<string, bool | string> $options
	 * @return \DOMDocument the parsed document tree
	 * @throws Html2TextException if the document could not be loaded
	 */
	private static function getDocument(string $html, array $options): \DOMDocument {

		$doc = new \DOMDocument();

		$html = trim($html);

		// Compare against '' explicitly: a plain truthiness test would also
		// discard the perfectly valid one-character document "0".
		if ($html === '') {
			// DOMDocument doesn't support empty value and throws an error
			// Return empty document instead
			return $doc;
		}

		if ($html[0] !== '<') {
			// If HTML does not begin with a tag, we put a body tag around it.
			// If we do not do this, PHP will insert a paragraph tag around
			// the first block of text for some reason which can mess up
			// the newlines. See pre.html test for an example.
			$html = '<body>' . $html . '</body>';
		}

		$header = '';
		// use char sets for modern versions of php (>= 8.1)
		if (PHP_VERSION_ID >= 80100) {
			// use specified char_set, or auto detect if not set
			$char_set = ! empty($options['char_set']) ? $options['char_set'] : 'auto';
			if ('auto' === $char_set) {
				$char_set = mb_detect_encoding($html);
			} else if (strpos($char_set, ',') !== false) {
				// Pass the candidate list directly rather than through
				// mb_detect_order(), which would change the detection order
				// for the rest of the process.
				$char_set = mb_detect_encoding($html, $char_set);
			}

			// mb_detect_encoding() returns false when nothing matches; only
			// declare an encoding when we actually have one
			if (is_string($char_set) && $char_set !== '') {
				// turn off error detection for Windows-1252 legacy html
				if (strpos($char_set, '1252') !== false) {
					$options['ignore_errors'] = true;
				}
				$header = '<?xml version="1.0" encoding="' . $char_set . '">';
			}
		}

		if (! empty($options['ignore_errors'])) {
			$doc->strictErrorChecking = false;
			$doc->recover = true;
			$doc->xmlStandalone = true;
			$old_internal_errors = libxml_use_internal_errors(true);
			try {
				$load_result = $doc->loadHTML($header . $html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
			} finally {
				// always restore the caller's libxml error handling mode
				libxml_use_internal_errors($old_internal_errors);
			}
		}
		else {
			$load_result = $doc->loadHTML($header . $html);
		}

		if (!$load_result) {
			throw new Html2TextException("Could not load HTML - badly formed?", $html);
		}

		return $doc;
	}

	/**
	 * Replace any special characters with simple text versions, to prevent output issues:
	 * - Convert non-breaking spaces to regular spaces; and
	 * - Convert zero-width non-joiners to '' (nothing).
	 *
	 * This is to match our goal of rendering documents as they would be rendered
	 * by a browser.
	 *
	 * @param string $text the text to clean up
	 * @return string the text with the sequences from nbspCodes() and zwnjCodes() replaced
	 */
	private static function renderText(string $text): string {
		$text = str_replace(self::nbspCodes(), " ", $text);
		$text = str_replace(self::zwnjCodes(), "", $text);
		return $text;
	}

	/**
	 * Find the lower-cased node name of the next sibling that is either an
	 * element or a text node with visible content.
	 *
	 * @param \DOMNode|null $node the node whose following siblings are inspected
	 * @return string|null the sibling's node name, or null if there is none
	 */
	private static function nextChildName(?\DOMNode $node): ?string {
		if ($node === null) {
			// the signature allows null, so do not dereference it
			return null;
		}

		// get the next child
		$nextNode = $node->nextSibling;
		while ($nextNode !== null) {
			if ($nextNode instanceof \DOMText) {
				if (!self::isWhitespace($nextNode->wholeText)) {
					break;
				}
			}

			if ($nextNode instanceof \DOMElement) {
				break;
			}

			$nextNode = $nextNode->nextSibling;
		}

		$nextName = null;
		if ($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) {
			$nextName = strtolower($nextNode->nodeName);
		}

		return $nextName;
	}

	/**
	 * Recursively render a DOM node and its children as text.
	 *
	 * Note that child nodes are removed from $node while they are processed,
	 * so the document passed in is consumed by this call.
	 *
	 * @param \DOMNode $node the node to render
	 * @param string|null $prevName lower-cased name of the previous visible sibling, if any
	 * @param bool $in_pre whether $node is inside a <pre> element
	 * @param bool $is_office_document whether the source was detected as Microsoft Office HTML
	 * @param array<string, bool | string> $options
	 * @return string the rendered text, before final whitespace processing
	 */
	private static function iterateOverNode(\DOMNode $node, ?string $prevName, bool $in_pre, bool $is_office_document, array $options): string {
		if ($node instanceof \DOMText) {
			// Replace whitespace characters with a space (equivalent to \s)
			if ($in_pre) {
				$text = "\n" . trim(self::renderText($node->wholeText), "\n\r\t ") . "\n";

				// Remove trailing whitespace only
				$text = preg_replace("/[ \t]*\n/im", "\n", $text);

				// armor newlines with \r.
				return str_replace("\n", "\r", $text);

			}
			$text = self::renderText($node->wholeText);
			$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);

			if (!self::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
				return "\n" . $text;
			}
			return $text;
		}

		if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
			// ignore
			return "";
		}

		$name = strtolower($node->nodeName);
		$nextName = self::nextChildName($node);

		// start whitespace
		switch ($name) {
			case "hr":
				$prefix = '';
				if ($prevName !== null) {
					$prefix = "\n";
				}
				return $prefix . "---------------------------------------------------------------\n";

			case "style":
			case "head":
			case "title":
			case "meta":
			case "script":
				// ignore these tags
				return "";

			case "h1":
			case "h2":
			case "h3":
			case "h4":
			case "h5":
			case "h6":
			case "ol":
			case "ul":
			case "pre":
				// add two newlines
				$output = "\n\n";
				break;

			case "td":
			case "th":
				// add tab char to separate table fields
				$output = "\t";
				break;

			case "p":
				// Microsoft exchange emails often include HTML which, when passed through
				// html2text, results in lots of double line returns everywhere.
				//
				// To fix this, for any p element with a className of `MsoNormal` (the standard
				// classname in any Microsoft export or outlook for a paragraph that behaves
				// like a line return) we skip the first line returns and set the name to br.
				// @phpstan-ignore-next-line
				if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
					$output = "";
					$name = 'br';
					break;
				}

				// add two lines
				$output = "\n\n";
				break;

			case "tr":
				// add one line
				$output = "\n";
				break;

			case "div":
				$output = "";
				if ($prevName !== null) {
					// add one line
					$output .= "\n";
				}
				break;

			case "li":
				$output = "- ";
				break;

			default:
				// print out contents of unknown tags
				$output = "";
				break;
		}

		// debug
		//$output .= "[$name,$nextName]";

		if (isset($node->childNodes)) {

			$n = $node->childNodes->item(0);
			$previousSiblingNames = [];
			$previousSiblingName = null;

			$parts = [];
			$trailing_whitespace = 0;

			while ($n !== null) {

				$text = self::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);

				// Pass current node name to next child, as previousSibling does not appear to get populated
				if ($n instanceof \DOMDocumentType
					|| $n instanceof \DOMProcessingInstruction
					|| ($n instanceof \DOMText && self::isWhitespace($text))) {
					// Keep current previousSiblingName, these are invisible
					$trailing_whitespace++;
				}
				else {
					$previousSiblingName = strtolower($n->nodeName);
					$previousSiblingNames[] = $previousSiblingName;
					$trailing_whitespace = 0;
				}

				// consume the child so that item(0) yields the next one
				$node->removeChild($n);
				$n = $node->childNodes->item(0);

				$parts[] = $text;
			}

			// Remove trailing whitespace, important for the br check below
			while ($trailing_whitespace-- > 0) {
				array_pop($parts);
			}

			// suppress last br tag inside a node list if follows text
			$last_name = array_pop($previousSiblingNames);
			if ($last_name === 'br') {
				$last_name = array_pop($previousSiblingNames);
				if ($last_name === '#text') {
					array_pop($parts);
				}
			}

			$output .= implode('', $parts);
		}

		// end whitespace
		switch ($name) {
			case "h1":
			case "h2":
			case "h3":
			case "h4":
			case "h5":
			case "h6":
			case "pre":
			case "p":
				// add two lines
				$output .= "\n\n";
				break;

			case "br":
				// add one line
				$output .= "\n";
				break;

			case "div":
				break;

			case "a":
				// links are returned in [text](link) format
				// @phpstan-ignore-next-line
				$href = $node->getAttribute("href");

				$output = trim($output);

				// remove double [[ ]] s from linking images
				if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
					$output = substr($output, 1, strlen($output) - 2);

					// for linking images, the title of the <a> overrides the title of the <img>
					// @phpstan-ignore-next-line
					if ($node->getAttribute("title")) {
						// @phpstan-ignore-next-line
						$output = $node->getAttribute("title");
					}
				}

				// if there is no link text, but a title attr
				// (explicit '' test so that the link text "0" counts as text)
				// @phpstan-ignore-next-line
				if ($output === '' && $node->getAttribute("title")) {
					// @phpstan-ignore-next-line
					$output = $node->getAttribute("title");
				}

				if ($href === '') {
					// it doesn't link anywhere; named anchors are bracketed
					// unless links are being dropped
					// @phpstan-ignore-next-line
					if ($node->getAttribute("name") !== '' && !$options['drop_links']) {
						$output = "[$output]";
					}
				} elseif (!in_array($href, [$output, "mailto:$output", "http://$output", "https://$output"], true)) {
					// Strict comparison above: loose == would treat numeric
					// strings such as "1e3" and "1000" as the same address.
					// When the link text already is the address, it is kept as is.
					if ($output === '') {
						// no link text at all: show the target
						$output = $href;
					} elseif (!$options['drop_links']) {
						$output = "[$output]($href)";
					}
				}

				// does the next node require additional whitespace?
				switch ($nextName) {
					case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
						$output .= "\n";
						break;
				}
				break;

			case "img":
				// images are rendered as [title] or [alt], or dropped if neither is set
				// @phpstan-ignore-next-line
				if ($node->getAttribute("title")) {
					// @phpstan-ignore-next-line
					$output = "[" . $node->getAttribute("title") . "]";
				// @phpstan-ignore-next-line
				} elseif ($node->getAttribute("alt")) {
					// @phpstan-ignore-next-line
					$output = "[" . $node->getAttribute("alt") . "]";
				} else {
					$output = "";
				}
				break;

			case "li":
				$output .= "\n";
				break;

			case "blockquote":
				// process quoted text for whitespace/newlines
				$output = self::processWhitespaceNewlines($output);

				// add leading newline
				$output = "\n" . $output;

				// prepend '> ' at the beginning of all lines
				$output = preg_replace("/\n/im", "\n> ", $output);

				// replace leading '> >' with '>>'
				$output = preg_replace("/\n> >/im", "\n>>", $output);

				// add another leading newline and trailing newlines
				$output = "\n" . $output . "\n\n";
				break;

			default:
				// do nothing
		}

		return $output;
	}
}

View File

@ -0,0 +1,15 @@
<?php
namespace Soundasleep;
/**
 * Exception carrying an extra free-form string alongside the usual message.
 */
class Html2TextException extends \Exception {

	/** @var string $more_info additional detail supplied by the thrower */
	public $more_info;

	/**
	 * @param string $message the exception message
	 * @param string $more_info additional detail, stored in $more_info
	 */
	public function __construct(string $message = "", string $more_info = "") {
		$this->more_info = $more_info;

		parent::__construct($message);
	}
}

View File

@ -0,0 +1,97 @@
<?php
require(__DIR__ . "/../src/Html2Text.php");
/**
 * PHPUnit suite for \Soundasleep\Html2Text.
 *
 * Every case converts html/<test>.html and compares the result with
 * txt/<result>.txt. When the two differ, the actual output is also written to
 * failures/<result>.output so it can be inspected after the run.
 */
class Html2TextTest extends \PHPUnit\Framework\TestCase {

	/**
	 * Remove the outputs left behind by a previous run.
	 *
	 * Entries whose name starts with a dot (".", "..", ".gitignore", ...) are kept.
	 */
	public static function setUpBeforeClass(): void {
		foreach (new DirectoryIterator(__DIR__ . '/failures') as $fileInfo) {
			if ($fileInfo->getFilename()[0] !== '.') {
				unlink($fileInfo->getPathname());
			}
		}
	}

	/**
	 * Convert a fixture with default options and compare it with the
	 * expected text file of the same name.
	 *
	 * @dataProvider providerFiles
	 */
	public function testFile(string $test): void {
		$this->doTestWithResults($test, $test, []);
	}

	/**
	 * Convert html/$test.html with $options and assert that the result equals
	 * txt/$result.txt (after Html2Text::fixNewlines() has been applied to the
	 * expected text).
	 *
	 * @param string $test   base name of the HTML fixture (no extension)
	 * @param string $result base name of the expected TXT fixture (no extension)
	 * @param bool | array<string, bool | string> $options passed straight to Html2Text::convert()
	 */
	public function doTestWithResults(string $test, string $result, $options = []): void {
		$html = __DIR__ . "/html/$test.html";
		$txt = __DIR__ . "/txt/$result.txt";

		$this->assertFileExists($html, "File '{$html}' does not exist");
		$this->assertFileExists($txt, "File '{$txt}' does not exist");

		// file_get_contents() returns false on a read error; fail with a clear
		// message instead of handing false to the converter.
		$input = file_get_contents($html);
		$this->assertNotFalse($input, "File '{$html}' could not be read");

		$expectedRaw = file_get_contents($txt);
		$this->assertNotFalse($expectedRaw, "File '{$txt}' could not be read");

		$expected = \Soundasleep\Html2Text::fixNewlines($expectedRaw);
		$output = \Soundasleep\Html2Text::convert($input, $options);

		// Strict comparison: with a loose != two numeric-looking strings such as
		// "1" and "1.0" count as equal, so no dump would be written even though
		// the assertion below can still fail.
		if ($output !== $expected) {
			file_put_contents(__DIR__ . "/failures/$result.output", $output);
		}

		$this->assertEquals($expected, $output, "{$html} file failed to convert to {$txt}");
	}

	/**
	 * Fixture base names used by testFile().
	 *
	 * Declared static: PHPUnit accepts static providers, and newer versions
	 * deprecate non-static ones.
	 *
	 * @return array<array<string>>
	 */
	public static function providerFiles(): array {
		return [
			['basic'],
			['anchors'],
			['more-anchors'],
			['test3'],
			['test4'],
			['table'],
			['nbsp'],
			['lists'],
			['pre'],
			['newlines'],
			['nested-divs'],
			['blockquotes'],
			['full_email'],
			['images'],
			['non-breaking-spaces'],
			['utf8-example'],
			['msoffice'],
			['dom-processing'],
			['empty'],
			['huge-msoffice'],
			['zero-width-non-joiners'],
		];
	}

	/**
	 * With 'ignore_errors' => false, the invalid fixture is expected to raise a PHP warning.
	 *
	 * NOTE(review): expectWarning() is deprecated in later PHPUnit 9 releases and
	 * removed in PHPUnit 10 - confirm against the PHPUnit version in use.
	 */
	public function testInvalidXML(): void {
		$this->expectWarning();
		$this->doTestWithResults("invalid", "invalid", ['ignore_errors' => false]);
	}

	/** With 'ignore_errors' => true, the invalid fixture must convert without failing. */
	public function testInvalidXMLIgnore(): void {
		$this->doTestWithResults("invalid", "invalid", ['ignore_errors' => true]);
	}

	public function testInvalidXMLIgnoreOldSyntax(): void {
		// for BC, allow old #convert(text, bool) syntax
		$this->doTestWithResults("invalid", "invalid", true);
	}

	/** An option key the converter does not know must raise InvalidArgumentException. */
	public function testInvalidOption(): void {
		$this->expectException(InvalidArgumentException::class);
		$this->doTestWithResults("basic", "basic", ['invalid_option' => true]);
	}

	/** 'drop_links' => true output of basic.html is compared with basic.no-links.txt. */
	public function testBasicDropLinks(): void {
		$this->doTestWithResults("basic", "basic.no-links", ['drop_links' => true]);
	}

	/** 'drop_links' => true output of anchors.html is compared with anchors.no-links.txt. */
	public function testAnchorsDropLinks(): void {
		$this->doTestWithResults("anchors", "anchors.no-links", ['drop_links' => true]);
	}

	/** Converts the windows-1252 fixture with 'char_set' => 'windows-1252'. */
	public function testWindows1252(): void {
		$this->doTestWithResults("windows-1252-example", "windows-1252-example", ['char_set' => 'windows-1252']);
	}
}

View File

@ -0,0 +1,8 @@
# Ignore everything
*
# But not these files...
!.gitignore
# ...even if they are in subdirectories
!*/

View File

@ -0,0 +1,12 @@
A document without any HTML open/closing tags.
<hr>
We try and use the representation given by common browsers of the
HTML document, so that it looks similar when converted to plain text.
<a href="http://foo.com">visit foo.com</a> - or <a href="http://www.foo.com">http://www.foo.com</a>
<a href="http://foo.com" title="a link with a title">link</a>
<h2><a name="anchor">An anchor which will not appear</a></h2>

View File

@ -0,0 +1,21 @@
<html>
<title>Ignored Title</title>
<body>
<h1>Hello, World!</h1>
<p>This is some e-mail content.
Even though it has whitespace and newlines, the e-mail converter
will handle it correctly.
<p>Even mismatched tags.</p>
<div>A div</div>
<div>Another div</div>
<div>A div<div>within a div</div></div>
<p>Another line<br />Yet another line</p>
<a href="http://foo.com">A link</a>
</body>
</html>

View File

@ -0,0 +1,43 @@
<span>Hello</span>
<blockquote>
Nest some block quotes with preformated text
<blockquote>
Here is the code
<pre>
#include &lt;stdlib.h&gt;
#include &lt;stdio.h&gt;
int main(){
return 0;
};
</pre>
<b>Put some tags</b>
<i>at the end</i>
</blockquote>
Some text <span>and tags</span> here
<blockquote>
First line
<h1>Header 1</h1>
Some text
<hr>
Some more text
<p>Paragraph tag!</p>
<h2>Header 2</h2>
<hr>
<h3>Header 3</h3>
Some text
<h4>Header 4</h4>
<blockquote>
More quoted text!
</blockquote>
<p>Paragraph tag!</p>
Final line
</blockquote>
</blockquote>
Some ending text
<b>just to make sure</b>

View File

@ -0,0 +1,8 @@
<html>
<body>
<?a
I am a random piece of code
?>
Hello
</body>
</html>

View File

View File

@ -0,0 +1,220 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=680">
</head>
<body class="cat-update-email cat-update" style="background: #ffccee; color: blue; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; text-align: center" bgcolor="#ffccee">
<style type="text/css">
body.cat-update-email {
margin: 0; padding: 0; background: #ffccee; color: blue; text-align: center;
}
body.cat-update-email {
font-size: 12px; font-family: Times New Roman; font-weight: normal;
}
body.cat-update-email th {
font-size: 12px; font-family: Times New Roman; font-weight: normal;
}
body.cat-update-email td {
font-size: 12px; font-family: Times New Roman; font-weight: normal;
}
</style>
<table class="header-wrapper" style="border-spacing: 0; border: none; margin: 0; width: 100%">
<tr>
<td class="header" style="background: none; color: #999; font-family: Times New Roman; font-size: 12px; font-weight: normal; padding: 15px 0">
<table cellspacing="0" cellpadding="0" border="0" style="margin: 0 auto; padding: 0 20px; width: 640px">
<tr>
<th style="font-family: Times New Roman; font-size: 12px; font-weight: normal">
<a class="logo" href="http://localhost/home" style="color: red; text-decoration: none">
<img border="0" height="32" src="test.png" width="200" style="display: block">
</a> </th>
<td class="account-number" style="color: white; font-family: Times New Roman; font-size: 12px; font-weight: normal; text-align: right" align="right">
16 December 2015<br>
Account 123
</td>
</tr>
</table>
</td>
</tr>
</table>
<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
<tr>
<td class="salutation section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
<h1 class="user_greeting" style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0 0 1em">
Hi Susan
</h1>
<p class="message" style="font-size: 1.5em; line-height: 1.2; margin: 0">
Here is your cat report.
</p>
</td>
</tr>
</table>
<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
<tr>
<td class="balance section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
<div class="account-status-heading" style="font-size: 2.5em; line-height: 1em; padding: 30px 20px; text-align: center" align="center">You have found <span class="status-cats-negative" style="color: #df0000">5 cats</span> less than anyone else</div>
<div id="cat-update-action-buttons">
<div id="buy-button" style="text-align: center" align="center">
<a class="btn-alert" href="http://localhost/cats" id="buy-cats-button" style="-moz-appearance: none; -webkit-appearance: none; background: #DF0000; border-radius: 3px; border: 11px solid #df0000; color: #fff; cursor: pointer; display: block; font-size: 16px; height: 16px; line-height: 16px; margin: 0 auto; text-decoration: none; transition: background-color .15s; width: 120px">Find more cats</a>
</div>
</div>
</td>
</tr>
</table>
<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
<tr>
<td class="cats section" id="cats" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
<div class="cats-usage">
<h2 style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0">Down the road</h2>
<p class="fine-print" style="margin: 0">Across the hall</p>
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Your achievements</h3>
<table class="current-usage with-icon-left" style="border-collapse: collapse; border-spacing: 0; margin-bottom: 20px; margin-top: 20px; width: 100%">
<tr>
<th style="border: none; font-family: Times New Roman; font-size: 14px; font-weight: bold; margin: 0; padding: 0; text-align: left; vertical-align: middle; width: 50px" align="left" valign="middle"><img src="test.png"></th>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top">
<div class="top">You're currently finding about</div>
<div class="large" style="color: black; font-size: 18px; padding: 4px 0">12 cats</div>
<div class="bottom">per day</div>
</td>
</tr>
<tr><td colspan="2" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top"> </td></tr>
<tr>
<td colspan="2" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top"><img alt="Number of cats found" src="test.png"></td>
</tr>
</table>
</div>
<div class="summary">
<hr class="fine-print" style="border-bottom-color: #eee; border-bottom-style: solid; border-width: 0 0 1px; margin: 20px 0">
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Your last cat was found two days ago.</h3>
<p class="fine-print" style="margin: 0">One type of cat is a kitten.</p>
<table class="readings" style="border-collapse: collapse; border-spacing: 0; margin: 10px 0; width: 100%">
<tr style="color: #BD236C">
<td class="left-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 5%">
<img src="test.png" style="padding-top: 10px">
</td>
<td class="center-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 60%">
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Special account <span class="nickname" style="font-size: 12px"></span> <span class="fine-print">A1</span>
</h3>
</td>
<td class="right-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 20%">
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">12.345</h3>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0"></td>
</tr>
</table>
</div>
</td>
</tr>
</table>
<div class="banner" style="margin: 0 auto 20px; padding: 10px; text-align: center; width: 640px" align="center">
<a href="http://localhost/logout" style="color: red; text-decoration: none">
<img alt="" border="0" height="177" src="http://localhost/photo1.png" width="600">
</a>
</div>
<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
<tr>
<td class="tips section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
<table style="border-collapse: collapse; border-spacing: 0; width: 100%">
<tr>
<td colspan="3" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><h2 style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0 0 10px">How can you find more cats?</h2></td>
</tr>
<tr class="icon">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo1.png" width="40"></td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo2.png" width="40"></td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo3.png" width="40"></td>
</tr>
<tr class="subtitle">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Look in trash cans</h3></td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Start meowing</h3></td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Eat cat food</h3></td>
</tr>
<tr class="body" style="color: green">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">Some cats like to hang out in trash cans. Some cats do not.</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">Some cats are attracted to similar tones.</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">So one day your tears may smell like cat food, attracting more cats.</td>
</tr>
<tr class="image">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">
<a href="https://localhost/about" style="color: red; text-decoration: none">
<img border="0" height="130" src="http://localhost/photo1.png" style="display: block; margin: 10px 0" width="165">
</a>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
<a href="https://localhost/about" style="color: red; text-decoration: none">
<img border="0" height="130" src="http://localhost/photo2.png" style="display: block; margin: 10px 0" width="165">
</a>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
<a href="https://localhost/about" style="color: red; text-decoration: none">
<img border="0" height="130" src="http://localhost/photo3.png" style="display: block; margin: 10px 0" width="165">
</a>
</td>
</tr>
<tr class="tips-footer" style="color: green">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Cats are great.</a>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Find more cats.</a>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Do more things.</a>
</td>
</tr>
</table>
</td>
</tr>
</table>
<table class="footer-wrapper" style="margin: 0 auto 20px">
<tr>
<td class="footer" style="color: #9B9B9B; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 4em; text-align: left; width: 600px" align="left">
<h3 style="font-family: Times New Roman; font-size: 1.2; font-weight: normal; line-height: 2em; margin: 0">
<a href="http://localhost/contact" style="color: red; text-decoration: none">Contact us</a>
</h3>
<p style="margin: 0 0 1em">
cats@cats.com<br>
Monday and Friday
</p>
<p style="margin: 0 0 1em"><a href="https://github.com/soundasleep/html2text" style="color: red; text-decoration: none"><img align="absmiddle" height="26" src="test.png" width="26"></a>
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none"><img align="absmiddle" height="26" src="test.png" width="26"></a>
</p>
<p class="message no-web-display" style="margin: 0">Having trouble seeing this email?
<a href="http://localhost/view_it_online" style="color: red; text-decoration: none">View it online</a>.
</p>
</td>
</tr>
</table>
<script async type="text/javascript" id="profiler" src="/profiler.js" data-version="1.0"></script>
</body>
</html>

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,54 @@
<body>
<p>
One: <img src="one.png">
</p>
<p>
Two: <img src="two.png" alt="two">
</p>
<p>
Three: <img src="three.png" title="three">
</p>
<p>
Four: <img src="four.png" title="four" alt="four alt">
</p>
<h1>With links</h1>
<p>
One: <a href="http://localhost"><img src="one.png"></a>
</p>
<p>
Two: <a href="http://localhost"><img src="two.png" alt="two"></a>
</p>
<p>
Three: <a href="http://localhost"><img src="three.png" title="three"></a>
</p>
<p>
Four: <a href="http://localhost"><img src="four.png" title="four" alt="four alt"></a>
</p>
<h1>With links with titles</h1>
<p>
One: <a href="http://localhost" title="one link"><img src="one.png"></a>
</p>
<p>
Two: <a href="http://localhost" title="two link"><img src="two.png" alt="two"></a>
</p>
<p>
Three: <a href="http://localhost" title="three link"><img src="three.png" title="three"></a>
</p>
<p>
Four: <a href="http://localhost" title="four link"><img src="four.png" title="four" alt="four alt"></a>
</p>
</body>
</html>

View File

@ -0,0 +1,4 @@
<b>Hello &nbsnbsp; world</b>
<div class=">
Error
</div>

View File

@ -0,0 +1,24 @@
<h1>List tests</h1>
<p>
Add some lists.
</p>
<ol>
<li>one</li>
<li>two
<li>three
</ol>
<h2>An unordered list</h2>
<ul>
<li>one
<li>two</li>
<li>three</li>
</ul>
<ul>
<li>one
<li>two</li>
<li>three</li>
</ul>

View File

@ -0,0 +1,14 @@
<h1>Anchor tests</h1>
<p>
Visit http://openiaml.org or <a href="http://openiaml.org">openiaml.org</a> or <a href="http://openiaml.org">http://openiaml.org</a>.
</p>
<p>
To visit with SSL, visit https://openiaml.org or <a href="https://openiaml.org">openiaml.org</a> or <a href="https://openiaml.org">https://openiaml.org</a>.
</p>
<p>
To mail, email support@openiaml.org or mailto:support@openiaml.org
or <a href="mailto:support@openiaml.org">support@openiaml.org</a> or <a href="mailto:support@openiaml.org">mailto:support@openiaml.org</a>.
</p>

View File

@ -0,0 +1 @@
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><style><!-- /* Font Definitions */ @font-face {font-family:"Cambria Math"; panose-1:2 4 5 3 5 4 6 3 2 4;} @font-face {font-family:Calibri; panose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {margin:0cm; margin-bottom:.0001pt; font-size:11.0pt; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} a:link, span.MsoHyperlink {mso-style-priority:99; color:#0563C1; text-decoration:underline;} a:visited, span.MsoHyperlinkFollowed {mso-style-priority:99; color:#954F72; text-decoration:underline;} span.EmailStyle17 {mso-style-type:personal-compose; font-family:"Calibri",sans-serif; color:windowtext;} .MsoChpDefault {mso-style-type:export-only; font-family:"Calibri",sans-serif; mso-fareast-language:EN-US;} @page WordSection1 {size:612.0pt 792.0pt; margin:72.0pt 72.0pt 72.0pt 72.0pt;} div.WordSection1 {page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-GB link="#0563C1" vlink="#954F72"><div class=WordSection1><p class=MsoNormal>Dear html2text,<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>This is an example email that can be used to test html2text conversion of outlook / exchange emails.<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>The addition of &lt;o:p&gt; tags is very annoying!<o:p></o:p></p><p class=MsoNormal>This is a single 
line return<o:p></o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal><b>This is bold<o:p></o:p></b></p><p class=MsoNormal><i>This is italic<o:p></o:p></i></p><p class=MsoNormal><u>This is underline<o:p></o:p></u></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>Andrew<o:p></o:p></p></div></body></html>

View File

@ -0,0 +1 @@
hello &nbsp; world &amp; people &lt; &gt; &NBSP;

View File

@ -0,0 +1,17 @@
<html>
<body>
<div>
Just two divs
</div>
<div>
Hanging out
</div>
<div><div><div>Nested divs and line breaks</div></div><br></div>
<div><div>Nested divs and line breaks</div>More text<br></div>
<div><br></div>
<div>Just text</div>
<div>Just text<br></div>
<div>Just text<br><br></div>
This is the end!
</body>
</html>

View File

@ -0,0 +1,50 @@
<html>
<body>
<div>
Hello
<br>
</div>
<div>
How are you?
<br>
</div>
<p>
How are you?
<br>
</p>
<p>
How are you?
<br>
</p>
<div>
Just two divs
</div>
<div>
Hanging out
</div>
This is not the end!
<div>
How are you again?
<br>
</div>
This is the end!
<br>
Just kidding
<h1>Header 1</h1>
Some text
<hr>
Some more text
<p>Paragraph tag!</p>
<h2>Header 2</h2>
<hr>
<h3>Header 3</h3>
Some text
<h4>Header 4</h4>
<p>Paragraph tag!</p>
Final line
</body>
</html>

View File

@ -0,0 +1 @@
these spaces are non-breaking

View File

@ -0,0 +1,10 @@
Here is the code
<pre>
#include &lt;stdlib.h&gt;
#include &lt;stdio.h&gt;
int main(){
return 0;
};
</pre>

View File

@ -0,0 +1,53 @@
<html>
<title>Ignored Title</title>
<body>
<h1>Hello, World!</h1>
<table>
<thead>
<tr>
<th>Col A</th>
<th>Col B</th>
</tr>
</thead>
<tbody>
<tr>
<td>
Data A1
</td>
<td>
Data B1
</td>
</tr>
<tr>
<td>
Data A2
</td>
<td>
Data B2
</td>
</tr>
<tr>
<td>
Data A3
</td>
<td>
Data B4
</td>
</tr>
</tbody>
<tfoot>
<tr>
<td>
Total A
</td>
<td>
Total B
</td>
</tr>
</tfoot>
</table>
</body>
</html>

View File

@ -0,0 +1 @@
test one<br />test two

View File

@ -0,0 +1 @@
1<br />2<br />3<br />4<br />5 &lt; 6

View File

@ -0,0 +1,4 @@
<ul>
<li>ÅÄÖ</li>
<li>åäö</li>
</ul>

View File

@ -0,0 +1,4 @@
<ul>
<li>ÅÄÖ</li>
<li>åäö</li>
</ul>

View File

@ -0,0 +1 @@
<p>foo&zwnj;bar</p>

View File

@ -0,0 +1,5 @@
A document without any HTML open/closing tags.
---------------------------------------------------------------
We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. visit foo.com - or http://www.foo.com link
An anchor which will not appear

View File

@ -0,0 +1,5 @@
A document without any HTML open/closing tags.
---------------------------------------------------------------
We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. [visit foo.com](http://foo.com) - or http://www.foo.com [link](http://foo.com)
[An anchor which will not appear]

View File

@ -0,0 +1,15 @@
Hello, World!
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
Even mismatched tags.
A div
Another div
A div
within a div
Another line
Yet another line
A link

View File

@ -0,0 +1,15 @@
Hello, World!
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
Even mismatched tags.
A div
Another div
A div
within a div
Another line
Yet another line
[A link](http://foo.com)

View File

@ -0,0 +1,44 @@
Hello
> Nest some block quotes with preformated text
>
>> Here is the code
>>
>> #include <stdlib.h>
>> #include <stdio.h>
>>
>> int main(){
>> return 0;
>> };
>>
>> Put some tags at the end
>
> Some text and tags here
>
>> First line
>>
>> Header 1
>>
>> Some text
>> ---------------------------------------------------------------
>> Some more text
>>
>> Paragraph tag!
>>
>> Header 2
>>
>> ---------------------------------------------------------------
>>
>> Header 3
>>
>> Some text
>>
>> Header 4
>>
>>> More quoted text!
>>
>> Paragraph tag!
>>
>> Final line
Some ending text just to make sure

View File

@ -0,0 +1 @@
Hello

View File

View File

@ -0,0 +1,53 @@
http://localhost/home 16 December 2015
Account 123
Hi Susan
Here is your cat report.
You have found 5 cats less than anyone else
[Find more cats](http://localhost/cats)
Down the road
Across the hall
Your achievements
You're currently finding about
12 cats
per day
[Number of cats found]
---------------------------------------------------------------
Your last cat was found two days ago.
One type of cat is a kitten.
Special account A1
12.345
http://localhost/logout
How can you find more cats?
Look in trash cans
Start meowing
Eat cat food
Some cats like to hang out in trash cans. Some cats do not. Some cats are attracted to similar tones. So one day your tears may smell like cat food, attracting more cats.
https://localhost/about https://localhost/about https://localhost/about
[Cats are great.](https://github.com/soundasleep/html2text_ruby) [Find more cats.](https://github.com/soundasleep/html2text_ruby) [Do more things.](https://github.com/soundasleep/html2text_ruby)
[Contact us](http://localhost/contact)
cats@cats.com
Monday and Friday
https://github.com/soundasleep/html2text https://github.com/soundasleep/html2text_ruby
Having trouble seeing this email? [View it online](http://localhost/view_it_online).

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,27 @@
One:
Two: [two]
Three: [three]
Four: [four]
With links
One: http://localhost
Two: [two](http://localhost)
Three: [three](http://localhost)
Four: [four](http://localhost)
With links with titles
One: [one link](http://localhost)
Two: [two link](http://localhost)
Three: [three link](http://localhost)
Four: [four link](http://localhost)

View File

@ -0,0 +1 @@
Hello &nbsnbsp; world

View File

@ -0,0 +1,17 @@
List tests
Add some lists.
- one
- two
- three
An unordered list
- one
- two
- three
- one
- two
- three

View File

@ -0,0 +1,7 @@
Anchor tests
Visit http://openiaml.org or openiaml.org or http://openiaml.org.
To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org.

View File

@ -0,0 +1,12 @@
Dear html2text,
This is an example email that can be used to test html2text conversion of outlook / exchange emails.
The addition of <o:p> tags is very annoying!
This is a single line return
This is bold
This is italic
This is underline
Andrew

View File

@ -0,0 +1 @@
hello world & people < > &NBSP;

View File

@ -0,0 +1,12 @@
Just two divs
Hanging out
Nested divs and line breaks
Nested divs and line breaks
More text
Just text
Just text
Just text
This is the end!

View File

@ -0,0 +1,35 @@
Hello
How are you?
How are you?
How are you?
Just two divs
Hanging out
This is not the end!
How are you again?
This is the end!
Just kidding
Header 1
Some text
---------------------------------------------------------------
Some more text
Paragraph tag!
Header 2
---------------------------------------------------------------
Header 3
Some text
Header 4
Paragraph tag!
Final line

View File

@ -0,0 +1 @@
these spaces are non-breaking

View File

@ -0,0 +1,8 @@
Here is the code
#include <stdlib.h>
#include <stdio.h>
int main(){
return 0;
};

View File

@ -0,0 +1,7 @@
Hello, World!
Col A Col B
Data A1 Data B1
Data A2 Data B2
Data A3 Data B4
Total A Total B

View File

@ -0,0 +1,2 @@
test one
test two

View File

@ -0,0 +1,5 @@
1
2
3
4
5 < 6

View File

@ -0,0 +1,2 @@
- ÅÄÖ
- åäö

View File

@ -0,0 +1,2 @@
- ÅÄÖ
- åäö

View File

@ -0,0 +1 @@
foobar