selfoss_html_feed/extractor.php

<?php

namespace spouts\html; // selfoss/src/spouts/html/extractor.php

use spouts\Item;
use helpers\WebClient;
use GuzzleHttp\Psr7\Uri;
use Monolog\Logger;
use DOMDocument;
use DOMXpath;
use DOMNode;
use DOMNodeList;

/**
 * Selfoss spout plugin for fetching arbitrary HTML pages as feeds
 *
 * Given corresponding XPath expressions, feed items are built from the parsed DOM.
 *
 * @author https://www.hackitu.de/
 */
class extractor extends \spouts\spout {
    /** @var string name of source */
    public $name = 'Extracted HTML';

    /** @var string description of this source type */
    public $description = 'Extract article lists from HTML pages.';

    /** @var ?string page URL as passed to load() */
    private $htmlUrl = null;

    /** @var ?string raw (possibly relative) favicon href found in the fetched page */
    private $iconUrl = null;

    /** @var ?string text of the fetched page's title element */
    private $title = null;

    /** @var array[] raw extraction results, one array per article with the keys link, title, sub_title, content and thumbnail */
    private $items = [];

    /** @var array configurable parameters */
    public $params = [
        'url' => [
            'title' => 'URL',
            'type' => 'url',
            'default' => '',
            'required' => true,
            'validation' => ['notempty'],
        ],
        'article_query' => [
            'title' => 'Main Article Separator XPath',
            'type' => 'text',
            'default' => '',
            'required' => true,
            'validation' => ['notempty'],
        ],
        'link_query' => [
            'title' => 'Link XPath',
            'type' => 'text',
            'default' => '',
            'required' => true,
            'validation' => ['notempty'],
        ],
        'title_query' => [
            'title' => 'Title XPath',
            'type' => 'text',
            'default' => '',
            'required' => true,
            'validation' => ['notempty'],
        ],
        'sub_title_query' => [
            'title' => 'Sub-Title XPath (optional)',
            'type' => 'text',
            'default' => '',
            'required' => false,
            'validation' => [''],
        ],
        'content_query' => [
            'title' => 'Content XPath (optional)',
            'type' => 'text',
            'default' => '',
            'required' => false,
            'validation' => [''],
        ],
        'thumbnail_query' => [
            'title' => 'Thumbnail XPath (optional)',
            'type' => 'text',
            'default' => '',
            'required' => false,
            'validation' => [''],
        ],
    ];

    /** @var WebClient */
    private $webClient;

    /** @var Logger */
    private $logger;

    public function __construct(WebClient $webClient, Logger $logger) {
        $this->webClient = $webClient;
        $this->logger = $logger;
    }

    /** @return ?string html url as configured */
    public function getHtmlUrl() {
        return $this->htmlUrl;
    }

    /** @return ?string favicon url found in the fetched page, resolved against the page url */
    public function getIcon() {
        // TODO: could use imageHelper->fetchFavicon
        // makeAbsolute() already yields null when no icon href was found.
        return self::makeAbsolute($this->htmlUrl, $this->iconUrl);
    }

    /** @return ?string title of the source as found in the fetched page */
    public function getTitle() {
        return $this->title;
    }

    /**
     * Fetch the configured url and extract title, icon and raw items via XPath.
     *
     * @param array $params spout parameters, keyed as declared in $params
     *
     * @return void
     */
    public function load(array $params) {
        $this->htmlUrl = $params['url'];

        // Drop everything derived from a previous load so that a failed fetch
        // cannot leave stale data behind.
        $this->title = null;
        $this->iconUrl = null;
        $this->items = [];

        $data = (string) $this->webClient->request($this->htmlUrl);
        if ($data === '') {
            // DOMDocument::loadHTML() rejects empty input: it raises a warning
            // before PHP 8 and throws a ValueError since.
            $this->logger->warning('Cannot parse document: ' . $this->htmlUrl);

            return;
        }

        $doc = new DOMDocument();
        if (!$doc->loadHTML($data, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            // Keep going: the queries below simply match nothing on an empty document.
            $this->logger->warning('Cannot parse document: ' . $this->htmlUrl);
        }

        $xpath = new DOMXpath($doc);
        $this->title = $this->findNodeValue($xpath, '/html/head/title');
        $this->iconUrl = $this->findNodeValue($xpath, '/html/head/link[@rel="shortcut icon" or @rel="icon"]/@href');
        $this->items = iterator_to_array($this->parseItems($xpath, $params), false);
    }

    /**
     * Run the article query and evaluate the per-field queries relative to each match.
     *
     * @param DOMXpath $xpath xpath evaluator bound to the parsed document
     * @param array $params spout parameters holding the XPath expressions
     *
     * @return \Iterator<array> parsed raw items
     */
    private function parseItems(DOMXpath $xpath, array $params) {
        $articles = $xpath->query($params['article_query']);
        if ($articles === false) {
            // query() returns false for a malformed expression, not for an empty result.
            $this->logger->warning('Invalid XPath query: ' . $params['article_query']);

            return;
        }
        if ($articles->length === 0) {
            $this->logger->warning('Found no articles');

            return;
        }

        foreach ($articles as $article) {
            yield [
                'link' => $this->findNodeValue($xpath, $params['link_query'], $article),
                'title' => $this->findNodeValue($xpath, $params['title_query'], $article),
                // Optional queries may be missing altogether; findNodeValue() treats null as "not configured".
                'sub_title' => $this->findNodeValue($xpath, $params['sub_title_query'] ?? null, $article),
                'content' => $this->findNodeValue($xpath, $params['content_query'] ?? null, $article),
                'thumbnail' => $this->findNodeValue($xpath, $params['thumbnail_query'] ?? null, $article),
            ];
        }
    }

    /**
     * Build items from the raw parsing results.
     *
     * Entries without a link, or with a link that cannot be resolved against
     * the page url, are skipped.
     *
     * @return \Iterator<Item<null>> list of item datastructures from raw parsing results
     */
    public function getItems() {
        $base = $this->getHtmlUrl();

        foreach ($this->items as $item) {
            if ($item['link'] === null) { // required
                continue;
            }

            $link = self::makeAbsolute($base, $item['link']);
            if ($link === null) {
                $this->logger->warning('Cannot resolve item link: ' . $item['link']);
                continue;
            }

            yield new Item(
                md5($item['link']), // id is derived from the raw link exactly as found in the page
                self::makeTitle($item['title'], $item['sub_title']),
                $item['content'] ?? '', // TODO: fulltext subclass that fetches link
                self::makeAbsolute($base, $item['thumbnail']),
                null,
                $link, // XXX: also gets htmLawed
                null,
                null,
            );
        }
    }

    /**
     * Extract the trimmed text of the first node of a query result.
     *
     * Only element, attribute and text nodes are supported; other node types
     * are logged and ignored.
     *
     * @return ?string textual content, or null if there is no node or only whitespace
     */
    private function selectNodeValue(DOMNodeList $nodes) {
        $node = $nodes->item(0); // first one wins
        if ($node === null) {
            return null;
        }

        switch ($node->nodeType) {
            case XML_ELEMENT_NODE:
                $text = $node->textContent;
                break;
            case XML_ATTRIBUTE_NODE:
                $text = $node->value;
                break;
            case XML_TEXT_NODE:
                $text = $node->wholeText;
                break;
            default:
                $this->logger->warning('Unsupported DOM node type: ' . $node->nodeType);

                return null;
        }

        $text = trim($text);

        // Compare against '' explicitly: a truthiness test would also discard the legitimate value "0".
        return $text === '' ? null : $text;
    }

    /**
     * Execute an xpath query and return the textual content of its first result.
     *
     * @param DOMXpath $xpath xpath evaluator bound to the parsed document
     * @param ?string $query XPath expression; null or empty means "not configured"
     * @param ?DOMNode $root context node for relative expressions
     *
     * @return ?string textual content, or null if unconfigured, invalid or without a match
     */
    private function findNodeValue(DOMXpath $xpath, ?string $query, ?DOMNode $root = null) {
        if ($query === null || $query === '') {
            return null;
        }

        $nodes = $xpath->query($query, $root);
        if ($nodes === false) {
            $this->logger->warning('Invalid XPath query: ' . $query);

            return null;
        }

        return $this->selectNodeValue($nodes);
    }

    /**
     * Convert a possibly relative reference to an absolute url using the overall base.
     *
     * References that already carry a scheme are returned unchanged. Everything
     * else is resolved against the base by Guzzle's UriResolver, so relative
     * paths replace the last segment of the base path rather than being
     * appended to it.
     *
     * @param ?string $base url the reference was found on
     * @param ?string $url reference to resolve
     *
     * @return ?string resolved url, or null if either argument is null or cannot be parsed
     */
    private static function makeAbsolute(?string $base, ?string $url) {
        if ($base === null || $url === null) { // XXX: a missing base would be ok if the actual url is absolute
            return null;
        }

        try {
            $reference = new Uri($url);
            if (Uri::isAbsolute($reference)) {
                return $url;
            }

            return (string) \GuzzleHttp\Psr7\UriResolver::resolve(new Uri($base), $reference);
        } catch (\InvalidArgumentException $e) {
            // Uri rejects malformed input; scraped markup must not abort the whole update.
            return null;
        }
    }

    /**
     * Concatenate title and sub-title in a readable way.
     *
     * @return string both parts joined by ': ', a single part on its own, or '' if neither is present
     */
    private static function makeTitle(?string $title, ?string $sub_title) {
        $parts = [];
        foreach ([$title, $sub_title] as $part) {
            if ($part !== null && $part !== '') {
                $parts[] = $part;
            }
        }

        return implode(': ', $parts);
    }

    /** @return void clear cache of parsed items */
    public function destroy() {
        $this->items = [];
    }
}