When Google Photos Gets Too Big

As I kept using Photos and Drive (including for Google Colab work), my storage grew larger and larger; it is now approaching 200 GB, and Google keeps telling me to upgrade. I was worried about the monthly fee, but even more than that I was reluctant to keep my personal information in Google's hands, so I used the money from an overseas business trip to upgrade from the Synology NAS DS118 I had been using to a DS223. After doing some research, my plan was to buy one 8 TB hard disk and reuse the 4 TB disk from the old DS118, but since it is apparently better to use two disks of the same capacity when filling both bays, I decided to run the two disks independently for now and expand later by buying another 8 TB disk.

I had mainly been using the NAS for file backup, but I looked into moving my Google Photos library to Synology Photos, which comes with the Synology NAS. Someone kindly posted a guide on the Internet: https://www.androidpolice.com/move-google-photos-synology-nas/ After trying various methods, I finally settled on using Google Takeout to download the photos and then uploading them to the Synology. However, for some reason the time each photo was taken did not survive the move, whether I downloaded or linked the files, so I decided to re-insert the creation-date metadata into the downloaded photos, with a little help from ChatGPT.

import os
import json
import zipfile
import hashlib
from shutil import copy2, rmtree
from tempfile import mkdtemp
from PIL import Image
import piexif
from datetime import datetime
import re

def file_hash(filepath):
    """Generate MD5 hash for a file."""
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def extract_zip(zip_path, extract_to):
    """Extracts zip file to the specified directory."""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def handle_json_metadata(json_path):
    """Extracts metadata from JSON file."""
    if os.path.exists(json_path):
        with open(json_path, 'r') as file:
            return json.load(file)
    return {}

def update_file_timestamps(file_path, date_time_str):
    """Update file's created and modified timestamps."""
    try:
        # date_time_str is expected in EXIF format, e.g. '2023:08:15 10:15:00'
        date_time = datetime.strptime(date_time_str, '%Y:%m:%d %H:%M:%S')
        timestamp = date_time.timestamp()
        os.utime(file_path, (timestamp, timestamp))
    except ValueError as e:
        print(f"Error updating timestamps for {file_path}: {e}")

def update_image_metadata(image_path, metadata, file_name):
    """Update the EXIF data of an image based on metadata."""
    try:
        img = Image.open(image_path)
        # PNGs and EXIF-less JPEGs make piexif.load() raise, so fall back to an empty EXIF dict
        try:
            exif_dict = piexif.load(img.info.get('exif', b''))
        except Exception:
            exif_dict = {"0th": {}, "Exif": {}, "GPS": {}, "1st": {}, "thumbnail": None}

        # Google Takeout records the capture time as a Unix timestamp (plus a localized
        # 'formatted' string); the timestamp is the reliable field to parse
        date_time = None
        taken = metadata.get('photoTakenTime', {})
        if taken.get('timestamp'):
            date_time = datetime.fromtimestamp(int(taken['timestamp'])).strftime('%Y:%m:%d %H:%M:%S')
        if not date_time:
            # Fall back to a YYYYMMDD pattern in the file name (e.g. IMG_20230815_101500.jpg)
            match = re.search(r'\d{8}', file_name)
            if match:
                date_time = datetime.strptime(match.group(), '%Y%m%d').strftime('%Y:%m:%d %H:%M:%S')

        if date_time:
            exif_dict['Exif'][piexif.ExifIFD.DateTimeOriginal] = date_time.encode()
            update_file_timestamps(image_path, date_time)

        # Write GPS coordinates. Takeout exports usually carry them under 'geoData';
        # older exports may use 'location', so check both.
        location = metadata.get('geoData') or metadata.get('location')
        if location:
            lat = location.get('latitude')
            lon = location.get('longitude')
            if lat is not None and lon is not None:
                lat_ref = 'N' if lat >= 0 else 'S'
                lon_ref = 'E' if lon >= 0 else 'W'
                lat_deg = abs(int(lat * 1000000))
                lon_deg = abs(int(lon * 1000000))
                exif_dict['GPS'] = {
                    piexif.GPSIFD.GPSLatitudeRef: lat_ref,
                    piexif.GPSIFD.GPSLatitude: [(lat_deg, 1000000), (0, 1), (0, 1)],
                    piexif.GPSIFD.GPSLongitudeRef: lon_ref,
                    piexif.GPSIFD.GPSLongitude: [(lon_deg, 1000000), (0, 1), (0, 1)]
                }

        exif_bytes = piexif.dump(exif_dict)
        img.save(image_path, exif=exif_bytes)
        img.close()
    except Exception as e:
        print(f"Error updating metadata for {image_path}: {e}")

def collect_files(directory, extensions, target_directory, seen_hashes):
    """Walk through directory, collect and copy image/video files to the target directory avoiding duplicates."""
    for root, dirs, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)
            if file.endswith('.zip'):
                temp_dir = mkdtemp()
                extract_zip(filepath, temp_dir)
                collect_files(temp_dir, extensions, target_directory, seen_hashes)
                rmtree(temp_dir)
            elif any(file.lower().endswith(ext) for ext in extensions):  # case-insensitive (.JPG, .MOV, ...)
                file_hash_val = file_hash(filepath)
                if file_hash_val not in seen_hashes:
                    seen_hashes.add(file_hash_val)
                    destination_path = os.path.join(target_directory, os.path.basename(filepath))
                    if not os.path.exists(destination_path):
                        copy2(filepath, destination_path)
                        if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                            # Takeout sidecars are usually named '<photo>.<ext>.json';
                            # fall back to '<photo>.json' in case the extension was dropped
                            json_path = filepath + '.json'
                            if not os.path.exists(json_path):
                                json_path = os.path.splitext(filepath)[0] + '.json'
                            metadata = handle_json_metadata(json_path)
                            update_image_metadata(destination_path, metadata, file)

# Main execution
source_directories = ['Download01', 'Download02', 'Download03']  # Source directories
target_directory = 'TargetDir03'  # Destination for unique media files
file_extensions = ['.jpg', '.jpeg', '.png', '.mp4', '.mov', '.avi']  # File types
seen_hashes = set()

os.makedirs(target_directory, exist_ok=True)

for directory in source_directories:
    collect_files(directory, file_extensions, target_directory, seen_hashes)

This code filled in the “Content created” metadata as shown; as you can see, the “Created” and “Modified” dates still reflect the time of download.
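To spot-check the result without opening each photo, the EXIF tag can also be read back with piexif. This is just a minimal sketch; the file name is a hypothetical example.

import piexif

def show_date_taken(image_path):
    """Print the EXIF DateTimeOriginal tag of a processed photo, if present."""
    exif_dict = piexif.load(image_path)  # piexif.load also accepts a file path
    raw = exif_dict['Exif'].get(piexif.ExifIFD.DateTimeOriginal)
    print(image_path, '->', raw.decode() if raw else 'no DateTimeOriginal tag')

show_date_taken('TargetDir03/IMG_20230815_101500.jpg')  # hypothetical example file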

Looking at Synology Photos' directory structure, photos are organized into directories by year, so the ever-friendly ChatGPT also wrote me a program to sort them by year (a sketch of the idea follows).
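That sorting script isn't reproduced here, but it works roughly like the following sketch, assuming the file modification time (which the earlier script set to the capture time) decides the destination year folder. The directory names are only illustrative.

import os
import shutil
from datetime import datetime

source_dir = 'TargetDir03'   # flat directory produced by the collection script
sorted_dir = 'SortedByYear'  # hypothetical output root, one sub-directory per year

for name in os.listdir(source_dir):
    path = os.path.join(source_dir, name)
    if not os.path.isfile(path):
        continue
    # The earlier script set the modification time to the capture time,
    # so the year can be read straight from the file's mtime
    year = datetime.fromtimestamp(os.path.getmtime(path)).strftime('%Y')
    dest_dir = os.path.join(sorted_dir, year)
    os.makedirs(dest_dir, exist_ok=True)
    shutil.move(path, os.path.join(dest_dir, name))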

However, when I actually looked inside, the directories were organized by year and month, not just by year. 🙁 To copy the files over, I connected to my home network through the VPN I had set up earlier (back when I was working at the school). Then I noticed there were far too many duplicate files; it seems duplicates had accumulated from all the linking and copying back and forth. Duplicates could in principle be detected from metadata or from the media content itself, but comparing the actual content proved difficult, so I tried deleting files that share the same “Content created” timestamp instead.

import os
import subprocess
from datetime import datetime
from pytz import timezone

def get_content_created_date_seoul(filepath):
    """Retrieve the 'Content Created' date and time as Seoul time.

    Uses mdls (macOS Spotlight metadata), so this only works on a Mac.
    """
    try:
        cmd = ['mdls', '-name', 'kMDItemContentCreationDate', '-raw', filepath]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.stdout.strip():
            # Extract the full datetime and convert to Seoul time
            local_time = datetime.strptime(result.stdout.strip(), '%Y-%m-%d %H:%M:%S %z')
            seoul_time = local_time.astimezone(timezone('Asia/Seoul'))
            return seoul_time
    except Exception as e:
        print(f"Error retrieving metadata for {filepath}: {e}")
    return None

def find_and_remove_duplicates(directory):
    """Remove files that share the exact same 'Content Created' timestamp, keeping the shortest filename."""
    files_by_datetime = {}
    files_to_remove = []

    # Group files by their exact 'Content Created' datetime in Seoul time
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        content_datetime = get_content_created_date_seoul(filepath)
        if content_datetime:
            files_by_datetime.setdefault(content_datetime, []).append(filepath)

    # Within each timestamp group, keep the file with the shortest name
    # (originals tend to have shorter names than their copies) and mark the rest for deletion
    for datetime_key, files in files_by_datetime.items():
        if len(files) > 1:
            print(f"Duplicate timestamp {datetime_key}: {files}")
            files.sort(key=len)  # shortest filename first
            files_to_remove.extend(files[1:])

    # Remove the marked duplicate files
    for file in files_to_remove:
        os.remove(file)
        print(f"Removed duplicate file: {file}")

    print(f"{len(files_to_remove)} files were removed")

# Example usage
directory = '2024'
find_and_remove_duplicates(directory)

In any case, the photos now seem to display properly in Synology Photos.

Since Synology Photos is running on a low-end NAS, it reportedly supports only person search, not object search. Google Photos certainly has the advantage here, with its rich search features and automatic album creation. It should be possible to access the photo directory on the Synology NAS and build various AI-based analysis and photo-management features on top of it, so let's look into that next time.
