Hello,
This is my first post and also my first time using django so please bear with me
I’m making a book recommender system and want to import some book metadata from this dataset. There are two gzipped JSON Lines files (every line is a JSON object): one contains book data, the other contains author data.
Here are the models:
class Author(models.Model):
author_id = models.IntegerField(primary_key=True)
name = models.CharField(max_length=50)
def __str__(self):
return self.name
class Book(models.Model):
book_id = models.IntegerField(primary_key=True)
title = models.CharField(max_length=50)
authors = models.ManyToManyField(Author, through='AuthorRole')
description = models.TextField()
average_rating = models.FloatField()
image_url = models.CharField(max_length=100)
def __str__(self):
return self.title
def goodreads_url(self):
return 'https://www.goodreads.com/book/show/{book_id}/'.format(book_id=self.book_id)
class AuthorRole(models.Model):
author = models.ForeignKey(Author, on_delete=models.CASCADE)
book = models.ForeignKey(Book, on_delete=models.CASCADE)
role = models.CharField(max_length=20)
def __str__(self):
return self.role
I’ve also written a custom management command to populate the database from the dataset, but it’s running painfully slowly — in fact it still hasn’t finished, and I started it about 2 hours ago…
import json
import gzip
import re
from django.core.management.base import BaseCommand, CommandError
from books.models import Author, Book, AuthorRole
def parse_json(path):
with gzip.open(path) as gz:
for line in gz:
yield json.loads(line)
class Command(BaseCommand):
help = 'Imports book and author data from the goodreads dataset available here: https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home'
def add_arguments(self, parser):
parser.add_argument('--authors-path', type=str, help='path to the authors file')
parser.add_argument('--books-path', type=str, help='path to the books file')
def handle(self, *args, **options):
Author.objects.all().delete()
Book.objects.all().delete()
authors_path = options['authors_path']
books_path = options['books_path']
for author in parse_json(authors_path):
author_object = Author.objects.create(author_id=int(author['author_id']), name=re.sub(r'\s\s+', ' ', author['name']))
author_object.save()
for book in parse_json(books_path):
book_id = int(book['book_id'])
title = book['title']
description = book['description']
authors = book['authors']
average_rating = 0.0 if book['average_rating'] == '' else float(book['average_rating'])
image_url = book['image_url']
book_object = Book.objects.create(book_id=book_id, title=title, description=description, average_rating=average_rating, image_url=image_url)
for author in authors:
role = AuthorRole(book=book_object, author=Author.objects.get(pk=int(author['author_id'])), role=author['role'])
role.save()
Is there any way to do this that will not take 3 days?