## pip install the transformers and datasets libraries.

In [1]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [4]:
model_name =  'distilbert-base-uncased' #'AI-Growth-Lab/PatentSBERTa'

##  Import relevant libraries and dependencies

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers Autokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Standard PyTorch DataLoader
from torch.utils.data import DataLoader

from transformers import pipeline, Trainer, TrainingArguments

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

use the `load_dataset` function to load all the patent applications that were filed to the USPTO in January 2016. We specify the date ranges of the training and validation sets as January 1-21, 2016 and January 22-31, 2016, respectively.

In [7]:
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

print('Loading is done!')

Downloading builder script:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

Downloading and preparing dataset hupd/sample to /root/.cache/huggingface/datasets/HUPD___hupd/sample-4345d0959024424b/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142...
Loading dataset with config: PatentsConfig(name='sample', version=0.0.0, data_dir='sample', data_files={'train': ['https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather']}, description='Patent data from January 2016, for debugging')


Downloading data:   0%|          | 0.00/6.67M [00:00<?, ?B/s]

Using metadata file: /root/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710


Downloading data:   0%|          | 0.00/388M [00:00<?, ?B/s]

Reading metadata file: /root/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710
Filtering train dataset by filing start date: 2016-01-01
Filtering train dataset by filing end date: 2016-01-21
Filtering val dataset by filing start date: 2016-01-22
Filtering val dataset by filing end date: 2016-01-31


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset hupd downloaded and prepared to /root/.cache/huggingface/datasets/HUPD___hupd/sample-4345d0959024424b/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Loading is done!


In [8]:
dataset_dict.shape

{'train': (16153, 14), 'validation': (9094, 14)}

In [9]:
validation_dict = dataset_dict['validation']
print(len(validation_dict))

9094


In [10]:
validation_dict[:1]

{'patent_number': ['13144833'],
 'decision': ['REJECTED'],
 'title': ['ROSACEA TREATMENTS AND KITS FOR PERFORMING THEM'],
 'abstract': ['Regimen for the treatment of rosacea include the application of an anti-redness composition to at least a portion of the cleansed area of skin afflicted with rosacea. The regimen may include the application of one or more of a polymetal complex, a composition containing metronidazole, and/or a protective composition. Kits containing components useful in performing such regimens are also described.'],
 'claims': ['1. A treatment regimen comprising: cleansing at least a portion of an area of skin afflicted with rosacea with an antimicrobial or cleanser; applying an anti-redness composition to at least a portion of the cleansed area; and applying a protective composition to at least a portion of the cleansed, and moisturized area. 2. A treatment regimen as in claim 1 further comprising the step of applying a composition containing metronidazole to at lea

In [17]:
keys = list(validation_dict[:1].keys())

In [18]:
keys

['patent_number',
 'decision',
 'title',
 'abstract',
 'claims',
 'background',
 'summary',
 'description',
 'cpc_label',
 'ipc_label',
 'filing_date',
 'patent_issue_date',
 'date_published',
 'examiner_id']

In [20]:
type(validation_dict[:1])

dict

## save to csv file

In [21]:
import csv

with open('patent_application.csv', 'w') as csvfile:
	writer = csv.DictWriter(csvfile, fieldnames = keys)
	writer.writeheader()
	writer.writerows([validation_dict[:1]])

In [22]:
import pandas as pd

df = pd.read_csv('patent_application.csv')

In [23]:
df

Unnamed: 0,patent_number,decision,title,abstract,claims,background,summary,description,cpc_label,ipc_label,filing_date,patent_issue_date,date_published,examiner_id
0,['13144833'],['REJECTED'],['ROSACEA TREATMENTS AND KITS FOR PERFORMING T...,['Regimen for the treatment of rosacea include...,['1. A treatment regimen comprising: cleansing...,['<SOH> BACKGROUND <EOH>Rosacea is a chronic i...,['<SOH> SUMMARY <EOH>The present disclosure pr...,"[""CROSS REFERENCE TO RELATED APPLICATION This ...",['A61K3334'],['A61K3334'],['20160122'],[''],['20160630'],['75390.0']
