Commit c4600fc4 authored by camille garcin's avatar camille garcin

Plantnet dataset with up to date images

parent 771d5991
# The plantnet dataset
The plantnet dataset comprises 1,081 plant species and 306,293 images in total, split into train, validation and test sets with 80/10/10% proportions respectively.
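On disk, the download scripts organise images by split and then by species label (see `plantnet_utils.py` below). A sketch of the resulting layout, where the species ID and image hash are illustrative placeholders:
```
your_path/
├── train/
│   └── 1234/            # one folder per species label
│       └── a1b2c3.jpg   # one file per image hash
├── val/
│   └── 1234/
└── test/
    └── 1234/
```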
## Installation
First, clone the project. The download only requires `tqdm`, `matplotlib` and `requests`. If you have conda, you can run:
```bash
conda env create -f plantnet_env.yml
conda activate plantnet_env
```
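If you do not use conda, installing the same three dependencies with pip should work equally well (assuming a Python 3.6+ environment, since the scripts use f-strings):
```bash
pip install tqdm matplotlib requests
```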
## Downloading the dataset
To download the dataset, run:
```bash
python dl_plantnet.py --root=your_path --max_workers=4
```
where `your_path` is the directory in which to save the dataset and `max_workers` is the number of threads used to download images (this matches the `--max_workers` argument defined in `dl_plantnet.py`).
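The script also accepts a `--resume_dl` flag (defined in `dl_plantnet.py`): if a previous download was interrupted, rerunning with this flag scans the dataset folder and skips the images already on disk, for example:
```bash
python dl_plantnet.py --root=your_path --max_workers=4 --resume_dl
```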
# dl_plantnet.py: main download script (see the README above for usage).
import os
import pickle
import logging
import argparse

from plantnet_utils import get_downloaded_hashes, try_open, \
    dl_from_list, post_processing

parser = argparse.ArgumentParser()
parser.add_argument('--root', required=True, help='directory path in which to place data')
parser.add_argument('--max_workers', type=int, default=4,
                    help='number of threads for downloading images, '
                         'see concurrent.futures.ThreadPoolExecutor for more info')
parser.add_argument('--resume_dl', action='store_true',
                    help='if a previous download was interrupted, use this option to skip images already on disk')
args = parser.parse_args()
# hash_id_list.pkl holds (image_hash, species_id) pairs; hash_to_split.pkl
# maps each image hash to its train/val/test split.
with open('hash_id_list.pkl', 'rb') as f:
    hash_id_list = pickle.load(f)
with open('hash_to_split.pkl', 'rb') as f:
    hash_to_split = pickle.load(f)

if args.resume_dl:
    seen_hashes = get_downloaded_hashes(args.root)
    print(f'{len(seen_hashes)} images already on disk, skipping them')
else:
    seen_hashes = set()

url_id_list = [(f'http://bs.floristic.org/image/m/{hash_img}.jpg', id_specy, hash_to_split[hash_img])
               for (hash_img, id_specy) in hash_id_list
               if hash_img not in seen_hashes]
# Log progress at INFO level to the console; persist warnings and errors to dl_logs.txt.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')

file_handler = logging.FileHandler('dl_logs.txt')
file_handler.setLevel(logging.WARNING)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
# Download the images (multi-threaded).
dl_from_list(url_id_list, args.root, max_workers=args.max_workers, logger=logger)
# Check the integrity of the images after download.
try_open(args.root, logger)
# Verify that each class keeps at least 4 images across splits; if not, delete that class from the dataset.
post_processing(args.root, min_species=4, delete=True, logger=logger)
# plantnet_utils.py: helper functions for downloading and sanity-checking the dataset.
import os
import concurrent.futures
from shutil import copy2, rmtree

import requests
import matplotlib.image as mpimage
from tqdm import tqdm
def get_downloaded_hashes(root):
    """Return the set of image hashes (file names without extension) already on disk."""
    seen_hashes = set()
    for dirpath, dirnames, filenames in os.walk(root):
        if ('train' in dirpath) or ('val' in dirpath) or ('test' in dirpath):
            for filename in filenames:
                seen_hashes.add(filename.split('.')[0])
    return seen_hashes
def wrapper_dl_image(logger):
    """Return a download function closed over `logger`, usable with executor.map."""
    def download_image(img_url, root, label, split):
        os.makedirs(os.path.join(root, split, label), exist_ok=True)
        # Compute the file name before the request so the except clause can reference it.
        img_name = os.path.basename(img_url)
        try:
            response = requests.get(img_url, timeout=60)
            response.raise_for_status()
            with open(os.path.join(root, split, label, img_name), 'wb') as img_file:
                img_file.write(response.content)
            logger.info(f'{img_name} was downloaded...')
        except Exception as e:
            logger.warning(f'Download error with image {img_name} : {e}')
    return download_image
def try_open(root, logger):
    """Try to load every image; move unreadable files to root/discarded_images and delete them."""
    for dirpath, dirnames, filenames in tqdm(list(os.walk(root)), desc='checking integrity of files'):
        if ('train' in dirpath) or ('val' in dirpath) or ('test' in dirpath):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                try:
                    mpimage.imread(filepath)
                except Exception as e:
                    logger.warning(f'Loading error with image {filepath} : {e}')
                    os.makedirs(os.path.join(root, 'discarded_images'), exist_ok=True)
                    logger.warning(f"Copying {filepath} to {os.path.join(root, 'discarded_images')}")
                    copy2(filepath, os.path.join(root, 'discarded_images'))
                    logger.warning(f"Deleting {filepath}")
                    os.remove(filepath)
def dl_from_list(url_id_list, root, max_workers, logger):
    """Download every (url, label, split) triple in url_id_list with a thread pool."""
    if not url_id_list:
        return
    urls, labels, splits = zip(*url_id_list)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Consume the lazy map through list() so all downloads complete; tqdm shows progress.
        list(tqdm(executor.map(wrapper_dl_image(logger), urls, [root] * len(url_id_list), labels, splits),
                  total=len(url_id_list)))
def post_processing(root, min_species, delete, logger):
    """Remove labels that end up with too few images across the three splits."""
    def remove_label(label):
        for split in ('train', 'val', 'test'):
            rmtree(os.path.join(root, split, label))

    for label in os.listdir(os.path.join(root, 'train')):
        n_images = sum(len(os.listdir(os.path.join(root, split, label)))
                       for split in ('train', 'val', 'test'))
        if n_images >= min_species:
            continue
        if n_images >= 3:
            # Borderline labels (3 to min_species - 1 images) are removed only when delete=True.
            if delete:
                logger.warning(f'label {label} has fewer than {min_species} images, deleting it from the dataset')
                remove_label(label)
            else:
                logger.warning(f'label {label} has fewer than {min_species} images and delete is False, doing nothing')
        else:
            # Labels with fewer than 3 images are always removed.
            logger.warning(f'label {label} has fewer than 3 images, deleting it from the dataset')
            remove_label(label)
# Ad-hoc development script: list the images already present in plantnet_subset/,
# re-download the missing ones and time the operation (errors go to look_for_crash.txt).
import pickle
import os
import concurrent.futures
import time
import logging

import requests
from tqdm import tqdm

downloaded_images = []
for root, dirs, files in os.walk("plantnet_subset"):
    for file in files:
        downloaded_images.append(file)
set_downloaded_images = set(downloaded_images)
print(f'{len(downloaded_images)} files on disk, {len(set_downloaded_images)} unique')

with open('hash_id_list.pkl', 'rb') as f:
    hash_id_list = pickle.load(f)
new_url_id_list = [(f'http://bs.floristic.org/image/m/{hash_img}.jpg', id_specy)
                   for (hash_img, id_specy) in hash_id_list
                   if f'{hash_img}.jpg' not in set_downloaded_images]
print(f'{len(new_url_id_list)} images left to download')

# Same logging setup as in dl_plantnet.py, but warnings go to look_for_crash.txt.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
file_handler = logging.FileHandler('look_for_crash.txt')
file_handler.setLevel(logging.WARNING)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
def download_image(img_url, root, label):
    os.makedirs(os.path.join(root, label), exist_ok=True)
    # Compute the file name before the request so the except clause can reference it.
    img_name = img_url.split('/')[-1]
    try:
        img_bytes = requests.get(img_url, timeout=60).content
        with open(os.path.join(root, label, img_name), 'wb') as img_file:
            img_file.write(img_bytes)
        logger.info(f'{img_name} was downloaded...')
    except Exception as e:
        logger.warning(f'Error with image {img_name} : {e}')
def dl_from_list(url_id_list):
    if not url_id_list:
        return
    urls, labels = zip(*url_id_list)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(download_image, urls, ['plantnet_subset'] * len(url_id_list), labels),
                  total=len(url_id_list)))

t1 = time.time()
dl_from_list(new_url_id_list)
print(f'download took {time.time() - t1:.1f} s')