This commit is contained in:
Peter Molnar 2016-01-07 16:26:25 +00:00
parent ec40791f5b
commit 21e20d02f0
2 changed files with 79 additions and 84 deletions

View file

@ -3,8 +3,8 @@ Contributors: cadeyrn
Donate link: https://paypal.me/petermolnar/3 Donate link: https://paypal.me/petermolnar/3
Tags: linkrot, archive, hyperlink, url Tags: linkrot, archive, hyperlink, url
Requires at least: 3.0 Requires at least: 3.0
Tested up to: 4.4 Tested up to: 4.4.1
Stable tag: 0.1 Stable tag: 0.2
License: GPLv3 License: GPLv3
License URI: http://www.gnu.org/licenses/gpl-3.0.html License URI: http://www.gnu.org/licenses/gpl-3.0.html
Required minimum PHP version: 5.3 Required minimum PHP version: 5.3
@ -36,6 +36,12 @@ Version numbering logic:
* every .B version indicates new features. * every .B version indicates new features.
* every ..C indicates bugfixes for A.B version. * every ..C indicates bugfixes for A.B version.
= 0.2 =
*2016-01-07*
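* reworked fetch logic: redirects are followed (with a loop limit), 4xx responses fall back to archive.org, 5xx responses are retried later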
* replaced the db table: snapshots now store the response status, headers, cookies and body in separate columns
= 0.1 = = 0.1 =
*2015-12-11* *2015-12-11*

View file

@ -3,7 +3,7 @@
Plugin Name: wp-url2snapshot Plugin Name: wp-url2snapshot
Plugin URI: https://github.com/petermolnar/wp-url2snapshot Plugin URI: https://github.com/petermolnar/wp-url2snapshot
Description: automatically stores snapshots of URLs found in content, to guard against linkrot Description: automatically stores snapshots of URLs found in content, to guard against linkrot
Version: 0.1 Version: 0.2
Author: Peter Molnar <hello@petermolnar.eu> Author: Peter Molnar <hello@petermolnar.eu>
Author URI: http://petermolnar.eu/ Author URI: http://petermolnar.eu/
License: GPLv3 License: GPLv3
@ -33,6 +33,8 @@ class WP_URL2SNAPSHOT {
const timeout = 5; const timeout = 5;
const redirection = 5; const redirection = 5;
private $looping = 0;
public function __construct() { public function __construct() {
add_action( 'init', array( &$this, 'init')); add_action( 'init', array( &$this, 'init'));
@ -119,39 +121,31 @@ class WP_URL2SNAPSHOT {
$urls = static::extract_urls($content); $urls = static::extract_urls($content);
foreach ($urls as $url) { foreach ($urls as $url) {
$url = esc_url_raw($url); $url = esc_url_raw($url);
if (empty($url)) {
return false; if (empty($url))
} continue;
$domain = parse_url(get_bloginfo('url'), PHP_URL_HOST); $domain = parse_url(get_bloginfo('url'), PHP_URL_HOST);
if (preg_match("/^https?:\/\/{$domain}.*$/", $url)) {
return false; if (preg_match("/^https?:\/\/{$domain}.*$/", $url))
} continue;
elseif (preg_match('/^https?:\/\/127\.0\.0\.1.*$/', $url )) {
return false; if (preg_match('/^https?:\/\/127\.0\.0\.1.*$/', $url ))
} continue;
static::debug(" found url {$url}" ); static::debug(" found url {$url}" );
if (!$this->hash_exists($url)) { if (!$this->hash_exists($url)) {
static::debug(" not yet snapshotted, doing it now" ); static::debug(" not yet snapshotted, doing it now" );
$status = true; $r = $this->get_url($url);
// status is passed by reference !!! if (!empty($r) && is_array($r) && isset($r['headers']) && isset($r['body'])) {
$content = $this->get_url($url, $status); $this->snapshot( $url, $r );
if (($content !== false && $status === true) || $status == 'e_nottext' ) {
// all clear or not text
// not text is stored, otherwise it won't be skipped and will be retried
$s = $this->snapshot( $url, $content );
} }
elseif ( $status == 'try_archive' ) { else {
// dead content, try archive.org static::debug(" getting url failed :(" );
$acontent = $this->try_archive($url); continue;
if (!empty($acontent)) {
$s = $this->snapshot( $url, $acontent );
}
} }
} }
else { else {
@ -169,20 +163,19 @@ class WP_URL2SNAPSHOT {
*/ */
private function try_archive ( &$url ) { private function try_archive ( &$url ) {
static::debug(' trying to get archive.org version instead'); static::debug(' trying to get archive.org version');
$astatus = true;
$wstatus = true;
$aurl = 'https://archive.org/wayback/available?url=' . $url; $aurl = 'https://archive.org/wayback/available?url=' . $url;
$archive = $this->get_url($aurl, $astatus); $archive = $this->get_url($aurl);
if (($archive == false || $astatus != true) ) { if (($archive === false) )
static::debug(" archive.org version failed"); return false;
return false;
} if (!is_array($archive) || !isset($archive['headers']) || !isset($archive['body']))
return false;
try { try {
$json = json_decode($archive); $json = json_decode($archive['body']);
} }
catch (Exception $e) { catch (Exception $e) {
static::debug(" something went wrong: " . $e->getMessage()); static::debug(" something went wrong: " . $e->getMessage());
@ -217,13 +210,7 @@ class WP_URL2SNAPSHOT {
$wurl = str_replace( $json->archived_snapshots->closest->timestamp, $json->archived_snapshots->closest->timestamp . 'id_', $wurl ); $wurl = str_replace( $json->archived_snapshots->closest->timestamp, $json->archived_snapshots->closest->timestamp . 'id_', $wurl );
static::debug(" trying {$wurl}"); static::debug(" trying {$wurl}");
$wget = $this->get_url($wurl, $wstatus); return $this->get_url($wurl);
if (($wget !== false && $wstatus === true) ) {
static::debug(" success! Found archive.org version at {$wurl}");
return $wget;
}
return false;
} }
/** /**
@ -255,7 +242,8 @@ class WP_URL2SNAPSHOT {
/** /**
* *
*/ */
private static function get_url ( &$url, &$status ) { private function get_url ( &$url ) {
if (empty($url)) if (empty($url))
return false; return false;
@ -270,82 +258,79 @@ class WP_URL2SNAPSHOT {
if ( is_wp_error( $response ) ) { if ( is_wp_error( $response ) ) {
static::debug(" retrieving URL ${url} failed: " . $response->get_error_message()); static::debug(" retrieving URL ${url} failed: " . $response->get_error_message());
if ( $response->get_error_message() == 'name lookup timed out' ) {
$status = 'try_archive';
}
else {
$status = 'e_error';
}
return false; return false;
} }
if (!isset($response['headers']) || empty($response['headers']) || !isset($response['response']) || empty($response['response']) || !isset($response['response']['code']) || empty($response['response']['code'])) { if (!isset($response['headers']) || empty($response['headers']) || !isset($response['response']) || empty($response['response']) || !isset($response['response']['code']) || empty($response['response']['code'])) {
static::debug(" WHAT? No or empty headers? Get out of here."); static::debug(" WHAT? No or empty headers? Get out of here.");
$status = 'e_noresponseheaders';
return false; return false;
} }
if (!isset($response['headers']['content-type']) || empty($response['headers']['content-type'])) { if (!isset($response['headers']['content-type']) || empty($response['headers']['content-type'])) {
static::debug(" Empty content type, I don't want this link"); static::debug(" Empty content type, I don't want this link");
$status = 'e_nomime';
return false; return false;
} }
if ($response['response']['code'] != 200) { // 400s: client error. Yeah, sure.
static::debug(" Response was {$response['response']['code']}."); if ($response['response']['code'] < 500 && $response['response']['code'] >= 400 ) {
if ( $response['response']['code'] == 404 ) { return $this->try_archive($url);
$status = 'try_archive'; }
// try next time
elseif ($response['response']['code'] >= 500 ) {
return false;
}
// redirects, follow redirect, but keep counting to avoid infinity
elseif ($response['response']['code'] < 400 && $response['response']['code'] >= 300 && isset($response['headers']['location']) && !empty($response['headers']['location'])) {
if ($this->looping < 6) {
$this->looping = $this->looping + 1;
return $this->get_url($response['headers']['location']);
} }
else { else {
$status = 'e_not200'; $this->looping = 0;
} return false;
return $response['response']['code'];
}
$mime_ok = false;
$mimes = array ('text/', 'application/json', 'application/javascript');
foreach ( $mimes as $mime ) {
if (stristr( $response['headers']['content-type'], $mime)) {
$mime_ok = true;
} }
} }
elseif ($response['response']['code'] == 200) {
$mime_ok = false;
$mimes = array ('text/', 'application/json', 'application/javascript');
foreach ( $mimes as $mime ) {
if (stristr( $response['headers']['content-type'], $mime)) {
$mime_ok = true;
}
}
if (!$mime_ok) { if (!$mime_ok) {
static::debug(" {$response['headers']['content-type']} is probably not text"); static::debug(" {$response['headers']['content-type']} is not text, we don't want it.");
$status = 'e_nottext'; return true;
}
}
else {
static::debug(" Response was {$response['headers']['code']}. This is not yet handled.");
return false; return false;
} }
$contents = wp_remote_retrieve_body( $response ); $this->looping = 0;
return $response;
if (is_wp_error($contents)) {
static::debug(" retrieving contents of URL ${url} failed: " . $response->get_error_message());
$status = 'e_content';
return false;
}
return $contents;
} }
/** /**
* *
*/ */
private function snapshot ( &$url, &$content ) { private function snapshot ( &$url, &$r ) {
global $wpdb; global $wpdb;
$dbname = "{$wpdb->prefix}urlsnapshots"; $dbname = "{$wpdb->prefix}urlsnapshots";
$r = false; $req = false;
$q = $wpdb->prepare( "INSERT INTO `{$dbname}` (`url_hash`,`url_date`,`url_url`,`url_content`) VALUES (UNHEX(SHA1('{$url}')), NOW(), '%s', '%s' );", $url, $content ); $q = $wpdb->prepare( "INSERT INTO `{$dbname}` (`url_hash`,`url_date`,`url_url`, `url_response`,`url_headers`, `url_cookies`,`url_body`) VALUES (UNHEX(SHA1('{$url}')), NOW(), '%s', '%s', '%s', '%s', '%s' );", $url, json_encode($r['response']), json_encode($r['headers']), json_encode($r['cookies']), $r['body'] );
try { try {
$r = $wpdb->query( $q ); $req = $wpdb->query( $q );
} }
catch (Exception $e) { catch (Exception $e) {
static::debug('Something went wrong: ' . $e->getMessage()); static::debug('Something went wrong: ' . $e->getMessage());
} }
return $r; return $req;
} }
/** /**
@ -372,7 +357,11 @@ class WP_URL2SNAPSHOT {
`url_hash` binary(20), `url_hash` binary(20),
`url_date` datetime NOT NULL DEFAULT NOW(), `url_date` datetime NOT NULL DEFAULT NOW(),
`url_url` text COLLATE {$wpdb->collate}, `url_url` text COLLATE {$wpdb->collate},
`url_content` longtext COLLATE {$wpdb->collate}, `url_response` text COLLATE {$wpdb->collate},
`url_headers` text COLLATE {$wpdb->collate},
`url_cookies` text COLLATE {$wpdb->collate},
`url_body` longtext COLLATE {$wpdb->collate},
PRIMARY KEY (`url_hash`) PRIMARY KEY (`url_hash`)
) {$charset_collate};"; ) {$charset_collate};";