0.2
This commit is contained in:
parent
ec40791f5b
commit
21e20d02f0
2 changed files with 79 additions and 84 deletions
10
readme.txt
10
readme.txt
|
@ -3,8 +3,8 @@ Contributors: cadeyrn
|
|||
Donate link: https://paypal.me/petermolnar/3
|
||||
Tags: linkrot, archive, hyperlink, url
|
||||
Requires at least: 3.0
|
||||
Tested up to: 4.4
|
||||
Stable tag: 0.1
|
||||
Tested up to: 4.4.1
|
||||
Stable tag: 0.2
|
||||
License: GPLv3
|
||||
License URI: http://www.gnu.org/licenses/gpl-3.0.html
|
||||
Required minimum PHP version: 5.3
|
||||
|
@ -36,6 +36,12 @@ Version numbering logic:
|
|||
* every .B version indicates new features.
|
||||
* every ..C indicates bugfixes for A.B version.
|
||||
|
||||
= 0.2 =
|
||||
*2016-01-07*
|
||||
|
||||
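* better logic: a skipped URL no longer aborts processing of the remaining links, 4xx responses fall back to archive.org, and redirects are followed with a loop limit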
|
||||
* replaced the database schema: the single content column is now split into separate response, headers, cookies and body columns
|
||||
|
||||
= 0.1 =
|
||||
*2015-12-11*
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
Plugin Name: wp-url2snapshot
|
||||
Plugin URI: https://github.com/petermolnar/wp-url2snapshot
|
||||
Description: automatically saves a snapshot of the URLs linked from post content to protect against link rot
|
||||
Version: 0.1
|
||||
Version: 0.2
|
||||
Author: Peter Molnar <hello@petermolnar.eu>
|
||||
Author URI: http://petermolnar.eu/
|
||||
License: GPLv3
|
||||
|
@ -33,6 +33,8 @@ class WP_URL2SNAPSHOT {
|
|||
const timeout = 5;
|
||||
const redirection = 5;
|
||||
|
||||
private $looping = 0;
|
||||
|
||||
public function __construct() {
|
||||
|
||||
add_action( 'init', array( &$this, 'init'));
|
||||
|
@ -119,39 +121,31 @@ class WP_URL2SNAPSHOT {
|
|||
$urls = static::extract_urls($content);
|
||||
foreach ($urls as $url) {
|
||||
$url = esc_url_raw($url);
|
||||
if (empty($url)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (empty($url))
|
||||
continue;
|
||||
|
||||
$domain = parse_url(get_bloginfo('url'), PHP_URL_HOST);
|
||||
if (preg_match("/^https?:\/\/{$domain}.*$/", $url)) {
|
||||
return false;
|
||||
}
|
||||
elseif (preg_match('/^https?:\/\/127\.0\.0\.1.*$/', $url )) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (preg_match("/^https?:\/\/{$domain}.*$/", $url))
|
||||
continue;
|
||||
|
||||
if (preg_match('/^https?:\/\/127\.0\.0\.1.*$/', $url ))
|
||||
continue;
|
||||
|
||||
static::debug(" found url {$url}" );
|
||||
|
||||
if (!$this->hash_exists($url)) {
|
||||
|
||||
static::debug(" not yet snapshotted, doing it now" );
|
||||
$status = true;
|
||||
$r = $this->get_url($url);
|
||||
|
||||
// status is passed by reference !!!
|
||||
$content = $this->get_url($url, $status);
|
||||
|
||||
if (($content !== false && $status === true) || $status == 'e_nottext' ) {
|
||||
// all clear or not text
|
||||
// not text is stored, otherwise it won't be skipped and will be retried
|
||||
$s = $this->snapshot( $url, $content );
|
||||
}
|
||||
elseif ( $status == 'try_archive' ) {
|
||||
// dead content, try archive.org
|
||||
$acontent = $this->try_archive($url);
|
||||
if (!empty($acontent)) {
|
||||
$s = $this->snapshot( $url, $acontent );
|
||||
if (!empty($r) && is_array($r) && isset($r['headers']) && isset($r['body'])) {
|
||||
$this->snapshot( $url, $r );
|
||||
}
|
||||
else {
|
||||
static::debug(" getting url failed :(" );
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
@ -169,20 +163,19 @@ class WP_URL2SNAPSHOT {
|
|||
*/
|
||||
private function try_archive ( &$url ) {
|
||||
|
||||
static::debug(' trying to get archive.org version instead');
|
||||
$astatus = true;
|
||||
$wstatus = true;
|
||||
static::debug(' trying to get archive.org version');
|
||||
$aurl = 'https://archive.org/wayback/available?url=' . $url;
|
||||
|
||||
$archive = $this->get_url($aurl, $astatus);
|
||||
$archive = $this->get_url($aurl);
|
||||
|
||||
if (($archive == false || $astatus != true) ) {
|
||||
static::debug(" archive.org version failed");
|
||||
if (($archive === false) )
|
||||
return false;
|
||||
|
||||
if (!is_array($archive) || !isset($archive['headers']) || !isset($archive['body']))
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
$json = json_decode($archive);
|
||||
$json = json_decode($archive['body']);
|
||||
}
|
||||
catch (Exception $e) {
|
||||
static::debug(" something went wrong: " . $e->getMessage());
|
||||
|
@ -217,13 +210,7 @@ class WP_URL2SNAPSHOT {
|
|||
$wurl = str_replace( $json->archived_snapshots->closest->timestamp, $json->archived_snapshots->closest->timestamp . 'id_', $wurl );
|
||||
static::debug(" trying {$wurl}");
|
||||
|
||||
$wget = $this->get_url($wurl, $wstatus);
|
||||
if (($wget !== false && $wstatus === true) ) {
|
||||
static::debug(" success! Found archive.org version at {$wurl}");
|
||||
return $wget;
|
||||
}
|
||||
|
||||
return false;
|
||||
return $this->get_url($wurl);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -255,7 +242,8 @@ class WP_URL2SNAPSHOT {
|
|||
/**
|
||||
*
|
||||
*/
|
||||
private static function get_url ( &$url, &$status ) {
|
||||
private function get_url ( &$url ) {
|
||||
|
||||
if (empty($url))
|
||||
return false;
|
||||
|
||||
|
@ -270,39 +258,39 @@ class WP_URL2SNAPSHOT {
|
|||
|
||||
if ( is_wp_error( $response ) ) {
|
||||
static::debug(" retrieving URL ${url} failed: " . $response->get_error_message());
|
||||
|
||||
if ( $response->get_error_message() == 'name lookup timed out' ) {
|
||||
$status = 'try_archive';
|
||||
}
|
||||
else {
|
||||
$status = 'e_error';
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!isset($response['headers']) || empty($response['headers']) || !isset($response['response']) || empty($response['response']) || !isset($response['response']['code']) || empty($response['response']['code'])) {
|
||||
static::debug(" WHAT? No or empty headers? Get out of here.");
|
||||
$status = 'e_noresponseheaders';
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!isset($response['headers']['content-type']) || empty($response['headers']['content-type'])) {
|
||||
static::debug(" Empty content type, I don't want this link");
|
||||
$status = 'e_nomime';
|
||||
return false;
|
||||
}
|
||||
|
||||
if ($response['response']['code'] != 200) {
|
||||
static::debug(" Response was {$response['response']['code']}.");
|
||||
if ( $response['response']['code'] == 404 ) {
|
||||
$status = 'try_archive';
|
||||
// 400s: client error. Yeah, sure.
|
||||
if ($response['response']['code'] < 500 && $response['response']['code'] >= 400 ) {
|
||||
return $this->try_archive($url);
|
||||
}
|
||||
// try next time
|
||||
elseif ($response['response']['code'] >= 500 ) {
|
||||
return false;
|
||||
}
|
||||
// redirects, follow redirect, but keep counting to avoid infinity
|
||||
elseif ($response['response']['code'] < 400 && $response['response']['code'] >= 300 && isset($response['headers']['location']) && !empty($response['headers']['location'])) {
|
||||
if ($this->looping < 6) {
|
||||
$this->looping = $this->looping + 1;
|
||||
return $this->get_url($response['headers']['location']);
|
||||
}
|
||||
else {
|
||||
$status = 'e_not200';
|
||||
$this->looping = 0;
|
||||
return false;
|
||||
}
|
||||
return $response['response']['code'];
|
||||
}
|
||||
|
||||
elseif ($response['response']['code'] == 200) {
|
||||
$mime_ok = false;
|
||||
$mimes = array ('text/', 'application/json', 'application/javascript');
|
||||
foreach ( $mimes as $mime ) {
|
||||
|
@ -312,40 +300,37 @@ class WP_URL2SNAPSHOT {
|
|||
}
|
||||
|
||||
if (!$mime_ok) {
|
||||
static::debug(" {$response['headers']['content-type']} is probably not text");
|
||||
$status = 'e_nottext';
|
||||
static::debug(" {$response['headers']['content-type']} is not text, we don't want it.");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
static::debug(" Response was {$response['headers']['code']}. This is not yet handled.");
|
||||
return false;
|
||||
}
|
||||
|
||||
$contents = wp_remote_retrieve_body( $response );
|
||||
|
||||
if (is_wp_error($contents)) {
|
||||
static::debug(" retrieving contents of URL ${url} failed: " . $response->get_error_message());
|
||||
$status = 'e_content';
|
||||
return false;
|
||||
}
|
||||
|
||||
return $contents;
|
||||
$this->looping = 0;
|
||||
return $response;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private function snapshot ( &$url, &$content ) {
|
||||
private function snapshot ( &$url, &$r ) {
|
||||
global $wpdb;
|
||||
$dbname = "{$wpdb->prefix}urlsnapshots";
|
||||
$r = false;
|
||||
$req = false;
|
||||
|
||||
$q = $wpdb->prepare( "INSERT INTO `{$dbname}` (`url_hash`,`url_date`,`url_url`,`url_content`) VALUES (UNHEX(SHA1('{$url}')), NOW(), '%s', '%s' );", $url, $content );
|
||||
$q = $wpdb->prepare( "INSERT INTO `{$dbname}` (`url_hash`,`url_date`,`url_url`, `url_response`,`url_headers`, `url_cookies`,`url_body`) VALUES (UNHEX(SHA1('{$url}')), NOW(), '%s', '%s', '%s', '%s', '%s' );", $url, json_encode($r['response']), json_encode($r['headers']), json_encode($r['cookies']), $r['body'] );
|
||||
|
||||
try {
|
||||
$r = $wpdb->query( $q );
|
||||
$req = $wpdb->query( $q );
|
||||
}
|
||||
catch (Exception $e) {
|
||||
static::debug('Something went wrong: ' . $e->getMessage());
|
||||
}
|
||||
|
||||
return $r;
|
||||
return $req;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -372,7 +357,11 @@ class WP_URL2SNAPSHOT {
|
|||
`url_hash` binary(20),
|
||||
`url_date` datetime NOT NULL DEFAULT NOW(),
|
||||
`url_url` text COLLATE {$wpdb->collate},
|
||||
`url_content` longtext COLLATE {$wpdb->collate},
|
||||
`url_response` text COLLATE {$wpdb->collate},
|
||||
`url_headers` text COLLATE {$wpdb->collate},
|
||||
`url_cookies` text COLLATE {$wpdb->collate},
|
||||
`url_body` longtext COLLATE {$wpdb->collate},
|
||||
|
||||
PRIMARY KEY (`url_hash`)
|
||||
) {$charset_collate};";
|
||||
|
||||
|
|
Loading…
Reference in a new issue