WikiArt는 전 세계의 예술 작품 정보를 수집하고 제공하는 비영리 예술 데이터베이스 웹사이트이다.
WikiArt는 사용자가 온라인에서 무료로 액세스할 수 있는 예술 작품의 이미지와 정보, 그리고 전반적인 문화 정보를 제공한다.
이런 결과물이 나오는 모델을 학습시키려면 데이터셋이 필요하므로, WikiArt에 있는 예술 작품 이미지들을 한꺼번에 크롤링한 뒤 이미지를 크롭하여 모델에 학습시켜 보려고 한다.
이미지 크롭하기 포스팅
해당 코드는 WikiArt에 Abstract 키워드로 분류된 작품들을 다운받는 코드이다.
#import urllib
import os
import re
import time
import urllib.request

from bs4 import BeautifulSoup
# Crawl WikiArt's alphabetical artist index and download public-domain
# paintings whose genre is "abstract" into `file_path`.
file_path = "art/wikiart"
base_url = "https://www.wikiart.org"

# An artist is only crawled when one of these words appears on their main page.
_ARTIST_KEYWORDS = ("Abstract", "abstract", "Avant-garde", "avant-garde")


def _fetch_soup(url):
    """Download *url* and return the page parsed with the lxml parser.

    The HTTP response is closed as soon as parsing has finished.
    Any error raised by ``urlopen`` propagates to the caller.
    """
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "lxml")


# urlretrieve() does not create missing directories, so make sure the
# download folder exists before the crawl starts.
os.makedirs(file_path, exist_ok=True)

# iterate through all artists by last name alphabetically
for c in range(ord('a'), ord('z') + 1):
    char = chr(c)
    artist_list_url = base_url + '/en/Alphabet/' + char + '/text-list'
    genre_soup = _fetch_soup(artist_list_url)
    artist_list_main = genre_soup.find("main")

    # for each artist entry in the list
    for artist_li in artist_list_main.find_all("li"):
        born = 0
        died = 0

        # get the date range: it is on a line starting with "," that contains "-"
        for line in artist_li.text.splitlines():
            if line.startswith(",") and "-" in line:
                parts = line.split('-')
                if len(parts) == 2:
                    born_digits = re.sub(r"[^0-9]", "", parts[0])
                    died_digits = re.sub(r"[^0-9]", "", parts[1])
                    # int("") raises ValueError, so ignore a range that is
                    # missing one of its years instead of aborting the crawl.
                    if born_digits and died_digits:
                        born = int(born_digits)
                        died = int(died_digits)

        # look for artists who may have created work that could be in public domain
        if not (born > 1850 and died > 0 and (born < 1900 or died < 1950)):
            continue

        link = artist_li.find("a")
        if link is None:
            continue
        artist = link.attrs["href"]
        if artist == "/en/salvador-dali":  # skip Dali
            continue

        # get the artist's main page
        artist_url = base_url + artist
        artist_soup = _fetch_soup(artist_url)

        # only look for artists with one of the keywords on their main page;
        # extract the page text once instead of once per keyword
        page_text = artist_soup.text
        if not any(keyword in page_text for keyword in _ARTIST_KEYWORDS):
            continue
        print(artist + " " + str(born) + " - " + str(died))

        # get the artist's web page for the artwork
        url = base_url + artist + '/all-works/text-list'
        artist_work_soup = _fetch_soup(url)

        # get the main section
        artist_main = artist_work_soup.find("main")
        image_count = 0
        artist_name = artist.split("/")[2]

        # for each artwork in the list
        for work_li in artist_main.find_all("li"):
            work_link = work_li.find("a")
            if work_link is None:
                continue
            painting = work_link.attrs["href"]

            # get the painting page; a single bad page must not stop the crawl
            url = base_url + painting
            try:
                painting_soup = _fetch_soup(url)
            except Exception:
                print("error retreiving page")
                continue

            # check the copyright
            if "Public domain" not in painting_soup.text:
                continue

            # check the genre
            genre = painting_soup.find("span", {"itemprop": "genre"})
            if genre is None or genre.text != "abstract":
                continue

            # get the image url from the page's og:image meta tag
            og_image = painting_soup.find("meta", {"property": "og:image"})
            if og_image is None or not og_image.get("content"):
                continue
            image_url = og_image["content"].split("!")[0]  # ignore the !Large.jpg at the end
            save_path = file_path + "/" + artist_name + "_" + str(image_count) + ".jpg"

            # download the file
            try:
                print("downloading to " + save_path)
                time.sleep(0.2)  # try not to get a 403
                urllib.request.urlretrieve(image_url, save_path)
                # only advance the counter after a successful download so
                # file names stay consecutive
                image_count = image_count + 1
            except Exception as e:
                print("failed downloading " + image_url, e)
'개발이야기 > Python' 카테고리의 다른 글
CondaSSLError : openssl appears to be unavailable on this machine. openssl is required to download and install packages. (0) | 2023.02.26 |
ChatGPT로 파이썬 코드 짜보기(이미지 자르기) (0) | 2023.02.18 |