<?php
namespace spouts\html; // selfoss/src/spouts/html/extractor.php
use DOMDocument;
use DOMNode;
use DOMNodeList;
use DOMXpath;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\UriResolver;
use helpers\WebClient;
use Monolog\Logger;
use spouts\Item;
/**
* Selfoss spout plugin for fetching arbitrary HTML pages as feeds
*
* Given corresponding XPath expressions, feed items are built from the parsed DOM.
*
* @author https://www.hackitu.de/
*/
class extractor extends \spouts\spout {
    /** @var string name of source */
    public $name = 'Extracted HTML';

    /** @var string description of this source type */
    public $description = 'Extract article lists from HTML pages.';

    /** @var ?string page URL as passed to load(), also used as base for relative references */
    private $htmlUrl = null;

    /** @var ?string raw (possibly relative) icon reference found in the document head */
    private $iconUrl = null;

    /** @var ?string text of the document's title element */
    private $title = null;

    /** @var array[] raw items with the keys link, title, sub_title, content and thumbnail */
    private $items = [];

    /** @var array configurable parameters */
    public $params = [
        'url' => [
            'title' => 'URL',
            'type' => 'url',
            'default' => '',
            'required' => true,
            'validation' => ['notempty'],
        ],
        'article_query' => [
            'title' => 'Main Article Separator XPath',
            'type' => 'text',
            'default' => '',
            'required' => true,
            'validation' => ['notempty'],
        ],
        'link_query' => [
            'title' => 'Link XPath',
            'type' => 'text',
            'default' => '',
            'required' => true,
            'validation' => ['notempty'],
        ],
        'title_query' => [
            'title' => 'Title XPath',
            'type' => 'text',
            'default' => '',
            'required' => true,
            'validation' => ['notempty'],
        ],
        'sub_title_query' => [
            'title' => 'Sub-Title XPath (optional)',
            'type' => 'text',
            'default' => '',
            'required' => false,
            'validation' => [''],
        ],
        'content_query' => [
            'title' => 'Content XPath (optional)',
            'type' => 'text',
            'default' => '',
            'required' => false,
            'validation' => [''],
        ],
        'thumbnail_query' => [
            'title' => 'Thumbnail XPath (optional)',
            'type' => 'text',
            'default' => '',
            'required' => false,
            'validation' => [''],
        ],
    ];

    /** @var WebClient */
    private $webClient;

    /** @var Logger */
    private $logger;

    public function __construct(WebClient $webClient, Logger $logger) {
        $this->webClient = $webClient;
        $this->logger = $logger;
    }

    /** @return ?string html url as configured */
    public function getHtmlUrl() {
        return $this->htmlUrl;
    }

    /** @return ?string favicon url inferred from fetched url, resolved against the page url */
    public function getIcon() {
        // TODO: could use imageHelper->fetchFavicon
        if ($this->iconUrl === null) {
            return null;
        }

        return self::makeAbsolute($this->htmlUrl, $this->iconUrl);
    }

    /** @return ?string title of the source as inferred from fetched url */
    public function getTitle() {
        return $this->title;
    }

    /**
     * Fetch the configured url and parse it via xpath.
     *
     * Results of a previous load() are discarded up front, so a failed fetch or
     * parse leaves the spout without items instead of exposing stale ones.
     *
     * @param array $params configured values, keyed like $this->params
     *
     * @return void
     */
    public function load(array $params) {
        $this->htmlUrl = $params['url'];
        $this->iconUrl = null;
        $this->title = null;
        $this->items = [];

        $data = (string) $this->webClient->request($this->htmlUrl);
        if (trim($data) === '') {
            // DOMDocument::loadHTML() throws a ValueError for empty input as of PHP 8
            $this->logger->warning('Received empty document: ' . $this->htmlUrl);

            return;
        }

        $doc = new DOMDocument();
        if (!$doc->loadHTML($data, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            $this->logger->warning('Cannot parse document: ' . $this->htmlUrl);

            return;
        }

        $xpath = new DOMXpath($doc);
        $this->title = $this->findNodeValue($xpath, '/html/head/title');
        $this->iconUrl = $this->findNodeValue($xpath, '/html/head/link[@rel="shortcut icon" or @rel="icon"]/@href');
        $this->items = iterator_to_array($this->parseItems($xpath, $params), false);
    }

    /**
     * Run the article query and extract the raw fields of every match.
     *
     * The per-field queries are evaluated with the respective article node as
     * context node. Missing optional query parameters are treated as not configured.
     *
     * @return \Generator<array> parsed raw items
     */
    private function parseItems(DOMXpath $xpath, array $params): \Generator {
        $articles = $xpath->query($params['article_query']);
        if ($articles === false) {
            // false means the expression itself is broken, not that nothing matched
            $this->logger->warning('Invalid XPath query: ' . $params['article_query']);

            return;
        }
        if ($articles->length === 0) {
            $this->logger->warning('Found no articles');

            return;
        }

        foreach ($articles as $article) {
            yield [
                'link' => $this->findNodeValue($xpath, $params['link_query'] ?? null, $article),
                'title' => $this->findNodeValue($xpath, $params['title_query'] ?? null, $article),
                'sub_title' => $this->findNodeValue($xpath, $params['sub_title_query'] ?? null, $article),
                'content' => $this->findNodeValue($xpath, $params['content_query'] ?? null, $article),
                'thumbnail' => $this->findNodeValue($xpath, $params['thumbnail_query'] ?? null, $article),
            ];
        }
    }

    /**
     * Build item datastructures from the raw parsing results of the last load().
     *
     * Items without a link, or with a link that cannot be turned into a URL, are skipped.
     *
     * @return \Iterator<Item<null>> list of item datastructures from raw parsing results
     */
    public function getItems() {
        $base = $this->getHtmlUrl();

        foreach ($this->items as $item) {
            if ($item['link'] === null) { // required
                continue;
            }

            $link = self::makeAbsolute($base, $item['link']); // XXX: also gets htmLawed
            if ($link === null) {
                $this->logger->warning('Skipping item with unresolvable link: ' . $item['link']);
                continue;
            }

            yield new Item(
                md5($item['link']), // id is derived from the link exactly as found in the page
                self::makeTitle($item['title'], $item['sub_title']),
                $item['content'] ?? '', // TODO: fulltext subclass that fetches link
                self::makeAbsolute($base, $item['thumbnail']),
                null,
                $link,
                null,
                null,
            );
        }
    }

    /**
     * Extract textual content from an xpath query result.
     *
     * Only the first node is considered. Element, attribute and text/CDATA nodes
     * are supported; any other node type is logged and ignored.
     *
     * @return ?string trimmed text, or null if there is no node or the text is empty
     */
    private function selectNodeValue(DOMNodeList $nodes): ?string {
        $node = $nodes->item(0); // first one wins
        if ($node === null) {
            return null;
        }

        switch ($node->nodeType) {
            case XML_ELEMENT_NODE:
                $text = $node->textContent;
                break;
            case XML_ATTRIBUTE_NODE:
                $text = $node->value;
                break;
            case XML_TEXT_NODE:
            case XML_CDATA_SECTION_NODE:
                $text = $node->wholeText;
                break;
            default:
                $this->logger->warning('Unsupported DOM node type: ' . $node->nodeType);

                return null;
        }

        // compare against '' explicitly so that a literal "0" is kept as a value
        $text = trim((string) $text);

        return $text === '' ? null : $text;
    }

    /**
     * Execute an xpath query for textual content.
     *
     * @param ?string $query xpath expression; null or blank means "not configured"
     * @param ?DOMNode $root context node for relative expressions, or null for the document
     *
     * @return ?string text of the first match, or null if unconfigured, invalid or without match
     */
    private function findNodeValue(DOMXpath $xpath, ?string $query, ?DOMNode $root = null): ?string {
        if ($query === null || trim($query) === '') {
            return null;
        }

        $nodes = $xpath->query($query, $root);
        if ($nodes === false) {
            $this->logger->warning('Invalid XPath query: ' . $query);

            return null;
        }

        return $this->selectNodeValue($nodes);
    }

    /**
     * Try to convert relative links to absolute ones using the overall base.
     *
     * Absolute URLs are returned unchanged, even without a base. Everything else
     * (relative paths, absolute paths, network-path, query-only and fragment-only
     * references) is resolved against the base by UriResolver.
     *
     * @return ?string absolute url, or null if it cannot be built (no url, no base, malformed input)
     */
    private static function makeAbsolute(?string $base, ?string $url): ?string {
        if ($url === null) {
            return null;
        }

        try {
            $reference = new Uri($url);
            if (Uri::isAbsolute($reference)) {
                return $url;
            }
            if ($base === null) {
                return null;
            }

            return (string) UriResolver::resolve(new Uri($base), $reference);
        } catch (\InvalidArgumentException $e) {
            // Uri rejects malformed input; a single broken reference must not abort the whole list
            return null;
        }
    }

    /**
     * Concatenate title and sub-title in a readable way.
     *
     * Missing or empty parts are left out, so no dangling separator is produced.
     *
     * @return string "title: sub-title", the single available part, or an empty string
     */
    private static function makeTitle(?string $title, ?string $sub_title): string {
        $parts = array_filter([$title, $sub_title], static function ($part) {
            return $part !== null && $part !== '';
        });

        return implode(': ', $parts);
    }

    /** @return void clear cache of parsed items */
    public function destroy() {
        $this->items = [];
    }
}