Version: 1
Type: Full Script
Category: Databases
License: GNU General Public License
Description: IMDB-Scraper lists a directory and runs the filename through IMDB, parses the content, and writes it to a file to be inserted into MySQL. Also it downloads the large cover poster.
Ok, so what this does is ..
(1) Opens a directory, lists all the movie-type files: ‘mkv, mp4, mpg, avi’
(2) Submits a query to Google: ‘I’m feeling lucky’
(3) Downloads all the information, parses it, and stores it in a ‘data’ file
(4) Copies the large poster-image to a ‘temp’ folder
I’m sure you can figure it out
Note: The last character in the last line of data.txt will be a comma (,) that needs to be a semicolon (;).
You can write something into this to add the semicolon or just change it manually.
This script takes the tedious work out of storing your movie collection in a database, but I prefer to do the insertion into the database manually to ensure fewer errors.
I didn’t write the IMDB-Scraper (don’t really know who did). I modified some parts of it, and made it automatic (90%, with the exception of the insert and the semicolon).
If anybody wishes to make those corrections, feel free :>
Anyways…
Create a database: moviedb
>Create a table: movies
>Then create new fields:
<-> movie_id ::: Random 9-10 digit id
<-> movie_fn ::: name of file located on the harddrive
<-> movie_path ::: Path where the file is located on the harddrive
<-> movie_title ::: Film Title
<-> movie_date ::: Release date: Day, Month, Year
<-> movie_actors ::: Cast members
<-> movie_about ::: Short description..
<-> movie_length ::: Length in minutes..
<-> movie_size ::: Size of the file on the disk..
<-> movie_rating ::: MPAA rating..
<-> movie_plot ::: This is the Synopsis field..
<-> movie_type ::: This is your Genres
<-> movie_when ::: This is for 'recently' added stuff.. The more recent, the closer to the top it is..
<-> movie_quality ::: This is a rough 'mkv/mp4' vs avi/mpg to lable as HD or SD -- not accurate unless you know your avis are SD and MKV's are HD, so on..

#!/usr/local/bin/php -q
<?php
/**
 * IMDB-Scraper driver script.
 *
 * Lists $path, and for every movie file (avi/mp4/mpg/mkv) that is not yet
 * present in `moviedb`.`movies` it asks MediaInfo::getMovieInfo() to look the
 * title up, download the large poster into $cover_path and append one VALUES
 * tuple to $data_f. The resulting data file is meant to be reviewed and
 * inserted into MySQL by hand.
 *
 * NOTE: this script targets PHP 5 (it relies on the legacy mysql_* extension).
 * The functions and the MediaInfo class are declared further down; PHP hoists
 * top-level declarations, so they are usable from the loop below.
 */

// Movie location.
$path = "/movies/movies";
$cover_path = "$path/temp";
$data_f = "$path/data.txt";

$dbhost = 'localhost';
$dbuser = 'YOUR_SQL_USERNAME';
$dbpass = 'YOUR_SQL_PASSWORD';
$dbname = 'moviedb';

$conn = mysql_connect($dbhost, $dbuser, $dbpass) or die("WTF I CANT CONNECT!!\n");
mysql_select_db($dbname, $conn) or die(mysql_error() . "\n");

// Start from a clean slate: drop the previous data file and, with it, any
// covers left over in the temp folder from the previous run.
if (file_exists($data_f)) {
    unlink($data_f);
    $stale_covers = glob("$cover_path/*.jpg");
    if ($stale_covers) {
        foreach ($stale_covers as $stale_cover) {
            unlink($stale_cover);
        }
    }
}

$dir_handle = @opendir($path) or die("Unable to open $path");

// Write the INSERT header; getMovieInfo() appends one "(...),"-row per movie.
$list = fopen($data_f, 'a') or die("Unable to open $data_f");
fwrite($list, "INSERT INTO `moviedb`.`movies` (`movie_id` , `movie_fn` , `movie_path` , `movie_title` , `movie_date` , `movie_actors` , `movie_about` , `movie_length` , `movie_size` , `movie_rating` , `movie_plot` , `movie_type`, `movie_when`, `movie_quality`) VALUES\n");
fclose($list);

$movie_exts = array('avi', 'mp4', 'mpg', 'mkv');

// "!== false" so that a file literally named "0" does not end the loop early.
while (($file = readdir($dir_handle)) !== false) {
    if ($file == '.' or $file == '..') {
        continue;
    }

    // pathinfo() copes with names that contain more than one dot.
    $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
    if (!in_array($ext, $movie_exts, true)) {
        //echo "Test failed: Skipping $file\n\n";
        continue;
    }

    $sqlfn = $file;
    // Search string: the file name without extension, separators as spaces.
    $search = str_replace(array('-', '_'), ' ', pathinfo($file, PATHINFO_FILENAME));

    // Exact comparison: LIKE would treat "_" and "%" in file names as wildcards.
    $fn = mysql_query("SELECT * FROM `movies` WHERE `movie_fn` = '" . mysql_real_escape_string($sqlfn, $conn) . "'") or die(mysql_error());
    $row = mysql_fetch_array($fn);
    if ($row && $row['movie_fn'] == $sqlfn) {
        // echo "Already got this one!";
        continue;
    }

    echo "Test one passed: Attempting to fetch information for title: $sqlfn...\n";
    $m = new MediaInfo();
    $info = $m->getMovieInfo($search, $sqlfn);
}
closedir($dir_handle);

/**
 * Format a byte count as a human readable string ("1.5 Gb", "20 Mb", ...).
 *
 * @param int|float $fs size in bytes
 * @return string size rounded to two decimals followed by its unit
 */
function consize($fs)
{
    if ($fs >= 1073741824) {
        return round($fs / 1073741824 * 100) / 100 . " Gb";
    }
    if ($fs >= 1048576) {
        return round($fs / 1048576 * 100) / 100 . " Mb";
    }
    if ($fs >= 1024) {
        return round($fs / 1024 * 100) / 100 . " Kb";
    }
    return $fs . " b";
}

/**
 * Prepare a scraped value for use inside a single-quoted SQL string literal.
 *
 * HTML entities (named and numeric) are decoded first and apostrophes are
 * doubled exactly once afterwards, so an apostrophe that only appears after
 * decoding (e.g. "&#039;") is escaped as well.
 *
 * @param string $in raw text
 * @return string decoded text with every ' replaced by ''
 */
function fixapos($in)
{
    $in = html_entity_decode((string) $in, ENT_QUOTES, "ISO-8859-1");
    // Decode whatever numeric entities html_entity_decode() left behind.
    $in = preg_replace_callback('/&#(\d+);/', function ($m) {
        return chr((int) $m[1]);
    }, $in);
    $in = preg_replace_callback('/&#x([a-f0-9]+);/i', function ($m) {
        return chr(hexdec($m[1]));
    }, $in);
    return str_replace("'", "''", $in);
}

/**
 * Looks a movie up on IMDB (via Google "I'm Feeling Lucky"), parses the page
 * and records the result.
 */
class MediaInfo
{
    /** @var array|null result of the last autodetect() call */
    public $info;

    /** @var string|null URL of the poster chosen by the last getMovieInfo() call */
    public $poster;

    /**
     * @param string|null $str optional file name; when given, autodetect() runs immediately
     */
    function __construct($str = null)
    {
        if (!is_null($str)) {
            $this->autodetect($str);
        }
    }

    /**
     * Decide whether $str names a TV episode or a movie and fetch its info.
     *
     * @param string $str file name (optionally with a path)
     * @return array info array, see getEpisodeInfo() / getMovieInfo()
     */
    function autodetect($str)
    {
        // getMovieInfo() needs the on-disk file name as well as the title.
        $file = basename($str);

        // Attempt to cleanup $str in case it's a filename ;-)
        $str = pathinfo($str, PATHINFO_FILENAME);
        $str = $this->normalize($str);

        // Is it a movie or tv show?
        if (preg_match('/s[0-9][0-9]?.?e[0-9][0-9]?/i', $str) == 1) {
            $this->info = $this->getEpisodeInfo($str);
        } else {
            $this->info = $this->getMovieInfo($str, $file);
        }
        return $this->info;
    }

    /**
     * Placeholder for TV episodes: only tags the result as 'tv'.
     *
     * @param string $str episode name (unused)
     * @return array array('kind' => 'tv')
     */
    function getEpisodeInfo($str)
    {
        $arr = array();
        $arr['kind'] = 'tv';
        return $arr;
    }

    /**
     * Fetch and parse the IMDB page for a title; when a poster can be
     * downloaded, save it to "$cover_path/<movie_id>.jpg" and append a VALUES
     * tuple for the movie to the data file.
     *
     * Reads the script globals $cover_path, $path and $data_f and requires an
     * open mysql connection. Titles without a downloadable poster are skipped
     * silently (nothing is written for them).
     *
     * @param string $str  search string (movie title)
     * @param string $file file name of the movie inside $path
     * @return array parsed movie info
     */
    function getMovieInfo($str, $file)
    {
        global $cover_path, $path, $data_f;

        // Drop the article "the"; \b keeps words such as "Breathe" intact.
        $str = preg_replace('/\bthe /i', '', $str);
        $url = "http://www.google.com/search?hl=en&q=imdb+" . urlencode($str) . "&btnI=I%27m+Feeling+Lucky";
        $html = $this->geturl($url);
        if (stripos($html, "302 Moved") !== false) {
            // geturl() does not follow redirects, so follow the link by hand.
            $html = $this->geturl($this->match('/HREF="(.*?)"/ms', $html, 1));
        }

        $arr = $this->parseMoviePage($html);

        // Historically the URL was taken from the third "="-separated part of
        // poster_large; keep that when present, otherwise use the URL itself.
        $parts = explode('=', $arr['poster_large']);
        $this->poster = isset($parts[2]) ? str_replace("'", "", $parts[2]) : $arr['poster_large'];

        // Reuse the id of an existing row for this file, otherwise make one up.
        $fn = mysql_query("SELECT * FROM `movies` WHERE `movie_fn` = '" . mysql_real_escape_string($file) . "'") or die(mysql_error());
        $row = mysql_fetch_array($fn);
        if ($row && $row['movie_id']) {
            $doid = $row['movie_id'];
        } else {
            $doid = rand(600000000, 1000000000);
        }

        if (!$this->poster) {
            //echo "Failed: an error while fetching: $file\n";
            return $arr;
        }

        $fp = fopen($this->poster, 'r');
        if (!$fp) {
            //echo "Failed: an error occured when trying to open the specified url $image\n\n";
            return $arr;
        }
        $content = stream_get_contents($fp);
        fclose($fp);
        file_put_contents("$cover_path/" . $doid . ".jpg", $content);

        // du instead of filesize(): see the original note; the path is quoted
        // so that file names with spaces or shell metacharacters are safe.
        // $arr['size'] = consize(filesize("$path/$file"));
        $bytes = trim((string) shell_exec("du --block-size 1 " . escapeshellarg("$path/$file") . " | awk '{print \$1}'"));
        $arr['size'] = consize((float) $bytes);
        $arr['title'] = str_replace(" - IMDb", "", $arr['title']);
        $arr['whenadd'] = time();

        // At most the first three genres, space separated.
        $genie = implode(' ', array_slice($arr['genres'], 0, 3));

        // Rough guess: container format decides HD vs SD.
        $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
        if ($ext == 'mkv' or $ext == 'mp4') {
            $arr['movie_quality'] = 'HD';
        } elseif ($ext == 'avi' or $ext == 'mpg') {
            $arr['movie_quality'] = 'SD';
        } else {
            $arr['movie_quality'] = '?';
        }

        // Cast members, each one followed by "&".
        $varx = '';
        foreach ($arr['cast'] as $member) {
            $varx .= $member . "&";
        }

        $sql = "('$doid' , '" . fixapos($file) . "' ,'" . fixapos($path) . "' ,'"
            . fixapos($arr['title']) . "' ,'"
            . fixapos($arr['release_date']) . "','"
            . fixapos($varx) . "','"
            . fixapos($arr['plot']) . "','"
            . fixapos($arr['runtime'] . " minutes") . "','"
            . fixapos($arr['size']) . "','"
            . fixapos($arr['mpaa_rating']) . "','"
            . fixapos($arr['storyline']) . "','"
            . fixapos($genie) . "','"
            . fixapos($arr['whenadd']) . "','"
            . $arr['movie_quality'] . "'),\n";

        $list = fopen($data_f, 'a') or die("Error writing file $sql");
        fwrite($list, $sql) or die("Error writing file $sql");
        echo "<-->Successfully wrote data for title: $arr[title]\n";
        fclose($list);

        return $arr;
    }

    /**
     * Extract the movie fields from the HTML of an IMDB title page.
     *
     * Patterns use "#" as delimiter because most of them contain "/".
     *
     * @param string $html page source
     * @return array keys: kind, id, title, rating, director, release_date,
     *               plot, storyline, runtime, genres, mpaa_rating, cast,
     *               poster, poster_large, poster_small
     */
    private function parseMoviePage($html)
    {
        $arr = array();
        $arr['kind'] = 'movie';
        $arr['id'] = $this->match('/poster.*?(tt[0-9]+)/ms', $html, 1);
        $arr['title'] = $this->match('#<title>(.*?)</title>#ms', $html, 1);
        // Strip the "(2010)"-style year that follows the name in <title>.
        $arr['title'] = preg_replace('/\([0-9]+\)/', '', $arr['title']);
        $arr['title'] = trim($arr['title']);
        $arr['rating'] = $this->match('#([0-9]\.[0-9])/10#ms', $html, 1);
        $arr['director'] = trim(strip_tags($this->match('#Director:(.*?)</a>#ms', $html, 1)));
        $arr['release_date'] = $this->match('/([0-9][0-9]? (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)[0-9][0-9])/ms', $html, 1);
        $arr['plot'] = trim(strip_tags($this->match('#Users:.*?<p>(.*?)(</p>|<a)#ms', $html, 1)));
        $arr['storyline'] = trim(strip_tags($this->match('#Storyline</h2>(.*?)(<em|</p>|<span)#ms', $html, 1)));
        $arr['runtime'] = trim($this->match('#Runtime:</h4>.*?([0-9]+) min.*?</div>#ms', $html, 1));

        $arr['genres'] = array();
        $genre_html = $this->match('#Genre.?:(.*?)(</div>|See more)#ms', $html, 1);
        $genre_links = $this->match_all('#<a.*?>(.*?)</a>#ms', $genre_html, 1);
        foreach ($genre_links ? $genre_links : array() as $m) {
            array_push($arr['genres'], $m);
        }

        $arr['mpaa_rating'] = $this->match('/infobar">.<img.*?alt="(.*?)".*?>/ms', $html, 1);

        $arr['cast'] = array();
        $cast_cells = $this->match_all('#<td class="name">(.*?)</td>#ms', $html, 1);
        foreach ($cast_cells ? $cast_cells : array() as $m) {
            array_push($arr['cast'], trim(strip_tags($m)));
        }

        $arr['poster'] = $this->match('#(http://ia\.media-imdb\.com/images.*?)" /></a>#ms', $html, 1);
        $arr['poster_large'] = "";
        $arr['poster_small'] = "";
        if ($arr['poster'] != '' && strrpos($arr['poster'], "nopicture") === false) {
            // Swap the size suffix after "_V1." for the wanted heights.
            $cut = strrpos($arr['poster'], "_V1.");
            if ($cut !== false) {
                $base = substr($arr['poster'], 0, $cut);
                $arr['poster_large'] = $base . "_V1._SY500.jpg";
                $arr['poster_small'] = $base . "_V1._SY150.jpg";
            } else {
                // No size marker to rewrite: use the URL as it is.
                $arr['poster_large'] = $arr['poster'];
                $arr['poster_small'] = $arr['poster'];
            }
        } else {
            $arr['poster'] = "";
        }

        return $arr;
    }

    // ****************************************************************

    /**
     * Turn "_" and "." into spaces and collapse runs of spaces.
     *
     * @param string $str
     * @return string
     */
    function normalize($str)
    {
        $str = str_replace('_', ' ', $str);
        $str = str_replace('.', ' ', $str);
        $str = preg_replace('/ +/', ' ', $str);
        return $str;
    }

    /**
     * GET a URL with curl (5 second connect timeout, redirects not followed).
     *
     * @param string      $url
     * @param string|null $username optional HTTP basic auth user
     * @param string|null $password optional HTTP basic auth password
     * @return string|false response body, or false when the request failed
     */
    function geturl($url, $username = null, $password = null)
    {
        $ch = curl_init();
        if (!is_null($username) && !is_null($password)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, array('Authorization: Basic ' . base64_encode("$username:$password")));
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        $html = curl_exec($ch);
        curl_close($ch);
        return $html;
    }

    /**
     * All matches of capture group $i.
     *
     * @param string $regex
     * @param string $str
     * @param int    $i capture group index
     * @return array|false list of matches, false when the regex failed
     */
    function match_all($regex, $str, $i = 0)
    {
        if (preg_match_all($regex, $str, $matches) === false) {
            return false;
        }
        return $matches[$i];
    }

    /**
     * First match of capture group $i.
     *
     * @param string $regex
     * @param string $str
     * @param int    $i capture group index
     * @return string|false the match, false when there is none
     */
    function match($regex, $str, $i = 0)
    {
        if (preg_match($regex, $str, $match) == 1) {
            return $match[$i];
        }
        return false;
    }
}

// Uncomment this if u want this to happen automatically.
// echo shell_exec('./do_thumbs.sh ' . escapeshellarg($path));
mysql_close($conn);
?>

<----bash file----->
Create a file called: do_thumbs.sh
chmod +x do_thumbs.sh
I wouldn't recommend removing comments until you verify that this works for you..
Adjust as needed, also, you need to verify you have: imagemagick.. http://www.imagemagick.org/script/index.php

#!/bin/bash
# Usage: do_thumbs.sh <movie directory>
# Creates 89x131 thumbnails for every cover in <dir>/temp and moves the
# originals to <dir>/covers. The commands that change anything (mkdir,
# convert, mv) are commented out on purpose: as shipped this is a dry run.

if [ "$(whoami)" != root ]; then
    echo "<-> You can only run this as root.."
    exit 1
fi

if [ -z "$1" ]; then
    echo "missing argument"
    exit 1
fi

if [ ! -d "$1" ]; then
    echo "<-> directory does not exist: $1"
    exit 1
fi

#temporary write directory
path=${1}/temp
#original directory
origpath=${1}/covers
#thumbnail directory
thumbpath=${1}/covers/thumbs

# make sure these directories exist!
if [ ! -d "$path" ]; then
    echo "<-> Creating directory: $path"
    #mkdir "$path"
fi

if [ ! -d "$origpath" ]; then
    echo "<-> Creating directory: $origpath"
    #mkdir "$origpath"
fi

if [ ! -d "$thumbpath" ]; then
    echo "<-> Creating directory: $thumbpath"
    #mkdir "$thumbpath"
fi

i=0
# Glob instead of $(ls ...) so names containing spaces stay in one piece.
for src in "$path"/*
do
    # An empty or missing directory leaves the pattern unexpanded: skip it.
    [ -e "$src" ] || continue
    file=${src##*/}
    echo "<--> Processing: $file 89x131"
    #convert -resize 89x131 "$path/$file" "$thumbpath/$file"
    echo "<----> Created THUMBNAIL: $thumbpath/$file"
    #mv "$path/$file" "$origpath/$file"
    echo "<----> moving ORIGINAL: $path/$file -> $origpath/$file"
    i=$((i + 1))
done
echo "<!--> Finished. $i files processed."