laipower/wp-content/plugins/static-html-output-plugin/plugin/WP2Static/HTMLProcessor.php

2 lines
20 KiB
PHP
Raw Normal View History

2020-04-07 13:03:04 +00:00
<?php
/**
 * Parses a crawled HTML page into a DOM, dispatches each element to a
 * tag-specific processor, manages the <base> tag, and records URLs
 * discovered along the way.
 *
 * Helpers such as loadSettings(), getTargetSiteProtocol(),
 * rewriteSiteURLsToPlaceholder(), processMeta(), etc. are not defined in
 * this part of the file; they are expected to come from the parent class
 * or from later in this class.
 */
class HTMLProcessor extends WP2Static {

    /**
     * Loads the 'github', 'wpenv', 'processing' and 'advanced' settings
     * groups and starts with an empty list of already-processed URLs.
     */
    public function __construct() {
        $this->loadSettings(
            array(
                'github',
                'wpenv',
                'processing',
                'advanced',
            )
        );

        $this->processed_urls = array();
    }

    /**
     * Processes one HTML document in place (result is kept in $this->xml_doc).
     *
     * @param string $html_document Raw HTML of the page.
     * @param string $page_url      URL of the page being processed.
     * @return bool False when the document is empty, true otherwise.
     */
    public function processHTML( $html_document, $page_url ) {
        if ( $html_document == '' ) {
            return false;
        }

        $this->xml_doc = new DOMDocument();

        $this->destination_protocol = $this->getTargetSiteProtocol(
            $this->settings['baseUrl']
        );
        $this->placeholder_url =
            $this->destination_protocol . 'PLACEHOLDER.wpsho/';

        $this->raw_html = $this->rewriteSiteURLsToPlaceholder( $html_document );
        $this->base_tag_exists = false;

        require_once dirname( __FILE__ ) . '/../URL2/URL2.php';
        $this->page_url = new Net_url2( $page_url );

        $this->detectIfURLsShouldBeHarvested();
        $this->discovered_urls = array();

        // Silence libxml warnings for real-world (often invalid) markup while
        // parsing, then drop the buffered errors and restore whatever error
        // mode the caller had, instead of unconditionally forcing it to false.
        $previous_libxml_mode = libxml_use_internal_errors( true );
        $this->xml_doc->loadHTML( $this->raw_html );
        libxml_clear_errors();
        libxml_use_internal_errors( $previous_libxml_mode );

        // Snapshot the live node list first: processLink() removes nodes
        // from the document, which would otherwise disturb the iteration.
        $elements = iterator_to_array(
            $this->xml_doc->getElementsByTagName( '*' )
        );

        foreach ( $elements as $element ) {
            switch ( $element->tagName ) {
                case 'meta':
                    $this->processMeta( $element );
                    break;
                case 'a':
                    $this->processAnchor( $element );
                    break;
                case 'img':
                    $this->processImage( $element );
                    $this->processImageSrcSet( $element );
                    break;
                case 'head':
                    $this->processHead( $element );
                    break;
                case 'link':
                    $this->processLink( $element );
                    break;
                case 'script':
                    $this->processScript( $element );
                    break;
            }
        }

        if ( $this->base_tag_exists ) {
            // An existing <base> is either pointed at the configured
            // baseHREF or removed entirely.
            $base_element =
                $this->xml_doc->getElementsByTagName( 'base' )->item( 0 );

            if ( $this->shouldCreateBaseHREF() ) {
                $base_element->setAttribute(
                    'href',
                    $this->settings['baseHREF']
                );
            } else {
                $base_element->parentNode->removeChild( $base_element );
            }
        } elseif ( $this->shouldCreateBaseHREF() ) {
            // No <base> yet: create one and make it the first child of <head>.
            $base_element = $this->xml_doc->createElement( 'base' );
            $base_element->setAttribute(
                'href',
                $this->settings['baseHREF']
            );

            $head_element =
                $this->xml_doc->getElementsByTagName( 'head' )->item( 0 );

            if ( $head_element ) {
                $first_head_child = $head_element->firstChild;
                $head_element->insertBefore(
                    $base_element,
                    $first_head_child
                );
            } else {
                require_once dirname( __FILE__ ) . '/../WP2Static/WsLog.php';
                WsLog::l(
                    'WARNING: no valid head elemnent to attach base to: ' .
                    $this->page_url
                );
            }
        }

        $this->stripHTMLComments();
        $this->writeDiscoveredURLs();

        return true;
    }

    /**
     * Decides whether newly discovered URLs should be harvested.
     *
     * Outside WP-CLI the flag is true only for the 'crawl_site' AJAX action.
     * Under WP-CLI the flag is set to true unless CRAWLING_DISCOVERED is
     * defined, in which case the property is left untouched.
     */
    public function detectIfURLsShouldBeHarvested() {
        if ( ! defined( 'WP_CLI' ) ) {
            // Guard the read: requests without an 'ajax_action' field used
            // to raise an undefined-index notice here.
            $this->harvest_new_urls = (
                isset( $_POST['ajax_action'] ) &&
                $_POST['ajax_action'] === 'crawl_site'
            );

            return;
        }

        if ( defined( 'CRAWLING_DISCOVERED' ) ) {
            return;
        }

        $this->harvest_new_urls = true;
    }

    /**
     * Processes a <link> element: normalizes and rewrites its href, records
     * the URL, and (when the 'removeWPLinks' setting is present) removes
     * links whose rel value is on the removal list or contains '.w.org'.
     *
     * @param DOMElement $element The <link> element.
     */
    public function processLink( $element ) {
        $this->normalizeURL( $element, 'href' );
        $this->removeQueryStringFromInternalLink( $element );
        $this->addDiscoveredURL( $element->getAttribute( 'href' ) );
        $this->rewriteWPPaths( $element );
        $this->rewriteBaseURL( $element );
        $this->convertToRelativeURL( $element );
        $this->convertToOfflineURL( $element );

        if ( ! isset( $this->settings['removeWPLinks'] ) ) {
            return;
        }

        $relative_links_to_rm = array(
            'shortlink',
            'canonical',
            'pingback',
            'alternate',
            'EditURI',
            'wlwmanifest',
            'index',
            'profile',
            'prev',
            'next',
        );

        $link_rel = $element->getAttribute( 'rel' );

        $should_remove =
            in_array( $link_rel, $relative_links_to_rm, true ) ||
            strpos( $link_rel, '.w.org' ) !== false;

        if ( $should_remove ) {
            $element->parentNode->removeChild( $element );
        }
    }

    /**
     * Checks whether a URL is acceptable for harvesting.
     *
     * Rejects empty strings, anything containing '.php' or a space, and
     * fragment-only references starting with '#'.
     *
     * @param string $url Candidate URL.
     * @return bool
     */
    public function isValidURL( $url ) {
        $url = trim( $url );

        if ( $url === '' ) {
            return false;
        }

        if ( strpos( $url, '.php' ) !== false ) {
            return false;
        }

        if ( strpos( $url, ' ' ) !== false ) {
            return false;
        }

        if ( $url[0] === '#' ) {
            return false;
        }

        return true;
    }

    /**
     * Records a URL found in the document, ignoring its fragment and query
     * string and skipping URLs that were already processed.
     *
     * NOTE(review): strtok() skips leading delimiters, so '#top' becomes
     * 'top' and '?x=1' becomes 'x=1' rather than an empty string; that
     * looks unintended and defeats the '#' check in isValidURL() - confirm.
     * NOTE(review): isset() is true when harvest_new_urls is false, so the
     * harvesting branch below also runs when the flag is false - confirm
     * whether a truthiness check was intended.
     *
     * @param string $url URL as found in the element attribute.
     */
    public function addDiscoveredURL( $url ) {
        $url = strtok( $url, '#' );
        $url = strtok( $url, '?' );

        if ( in_array( $url, $this->processed_urls ) ) {
            return;
        }

        if ( trim( $url ) === '' ) {
            return;
        }

        $this->processed_urls[] = $url;

        if ( isset( $this->harvest_new_urls ) ) {
            if ( !
$this->isValidURL( $url ) ) { return; } if ( $this->isInternalLink( $url ) ) { $discovered_url_without_site_url = str_replace( rtrim( $this->placeholder_url, '/' ), '', $url ); $this->logActio