Data sources

WikipediaLoader

Bases: BaseWebAPIDataLoader

Source code in autoresearcher/data_sources/web_apis/wikipedia_loader.py
class WikipediaLoader(BaseWebAPIDataLoader):
    def __init__(self):
        super().__init__("https://en.wikipedia.org/w/api.php")

    def fetch_data(self, search_query, results=10, language="en"):
        """
        Fetches data from the Wikipedia API.
        Args:
          search_query (str): The query to search for.
          results (int, optional): The maximum number of results to return. Defaults to 10.
          language (str, optional): The language to search in. Defaults to "en".
        Returns:
          list: A list of dictionaries containing the data for each result.
        Raises:
          wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.
        Examples:
          >>> loader = WikipediaLoader()
          >>> loader.fetch_data("Python")
          [
            {
              "title": "Python (programming language)",
              "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
              "summary": "Python is an interpreted, high-level, general-purpose programming language.",
              "content": "Python is an interpreted, high-level, general-purpose programming language...",
              "categories": ["Programming languages"],
              "references": ["https://www.python.org/"]
            }
          ]
        """
        wikipedia.set_lang(language)
        wikipedia.set_rate_limiting(True)

        search_results = wikipedia.search(search_query, results=results)
        data = []

        for result in search_results:
            try:
                page = wikipedia.page(result)
                data.append(
                    {
                        "title": page.title,
                        "url": page.url,
                        "summary": page.summary,
                        "content": page.content,
                        "categories": page.categories,
                        "references": page.references,
                    }
                )
            except wikipedia.exceptions.DisambiguationError as e:
                # Handle disambiguation pages by selecting the first option
                if e.options:
                    page = wikipedia.page(e.options[0])
                    data.append(
                        {
                            "title": page.title,
                            "url": page.url,
                            "summary": page.summary,
                            "content": page.content,
                            "categories": page.categories,
                            "references": page.references,
                        }
                    )
            except wikipedia.exceptions.PageError:
                # Skip pages that cannot be found
                continue

        return data

fetch_data(search_query, results=10, language='en')

Fetches data from the Wikipedia API.

Parameters:

    search_query (str, required): The query to search for.
    results (int, default 10): The maximum number of results to return.
    language (str, default 'en'): The language to search in.

Returns:

    list: A list of dictionaries containing the data for each result.

Raises:

    wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.

Examples:

>>> loader = WikipediaLoader()
>>> loader.fetch_data("Python")
[
  {
    "title": "Python (programming language)",
    "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "summary": "Python is an interpreted, high-level, general-purpose programming language.",
    "content": "Python is an interpreted, high-level, general-purpose programming language...",
    "categories": ["Programming languages"],
    "references": ["https://www.python.org/"]
  }
]
Source code in autoresearcher/data_sources/web_apis/wikipedia_loader.py
def fetch_data(self, search_query, results=10, language="en"):
    """
    Fetches data from the Wikipedia API.
    Args:
      search_query (str): The query to search for.
      results (int, optional): The maximum number of results to return. Defaults to 10.
      language (str, optional): The language to search in. Defaults to "en".
    Returns:
      list: A list of dictionaries containing the data for each result.
    Raises:
      wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.
    Examples:
      >>> loader = WikipediaLoader()
      >>> loader.fetch_data("Python")
      [
        {
          "title": "Python (programming language)",
          "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
          "summary": "Python is an interpreted, high-level, general-purpose programming language.",
          "content": "Python is an interpreted, high-level, general-purpose programming language...",
          "categories": ["Programming languages"],
          "references": ["https://www.python.org/"]
        }
      ]
    """
    wikipedia.set_lang(language)
    wikipedia.set_rate_limiting(True)

    search_results = wikipedia.search(search_query, results=results)
    data = []

    for result in search_results:
        try:
            page = wikipedia.page(result)
            data.append(
                {
                    "title": page.title,
                    "url": page.url,
                    "summary": page.summary,
                    "content": page.content,
                    "categories": page.categories,
                    "references": page.references,
                }
            )
        except wikipedia.exceptions.DisambiguationError as e:
            # Handle disambiguation pages by selecting the first option
            if e.options:
                page = wikipedia.page(e.options[0])
                data.append(
                    {
                        "title": page.title,
                        "url": page.url,
                        "summary": page.summary,
                        "content": page.content,
                        "categories": page.categories,
                        "references": page.references,
                    }
                )
        except wikipedia.exceptions.PageError:
            # Skip pages that cannot be found
            continue

    return data
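
A minimal usage sketch for WikipediaLoader (the import path is inferred from the file path shown above, and it assumes the `wikipedia` package is installed and network access is available; the query is illustrative only):

from autoresearcher.data_sources.web_apis.wikipedia_loader import WikipediaLoader

loader = WikipediaLoader()
# Fetch up to 5 English-language results for a query
pages = loader.fetch_data("transformer neural network", results=5, language="en")
for page in pages:
    # Each entry is a dict with title, url, summary, content, categories, references
    print(page["title"], "-", page["url"])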

SemanticScholarLoader

Bases: BaseWebAPIDataLoader

Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
class SemanticScholarLoader(BaseWebAPIDataLoader):
    def __init__(self):
        """
        Initializes the SemanticScholarLoader class.
        Args:
          None
        Returns:
          None
        Notes:
          Calls the superclass constructor with the SemanticScholar API URL.
        """
        super().__init__("https://api.semanticscholar.org/graph/v1/paper/search")

    def fetch_data(self, search_query, limit=100, year_range=None):
        """
        Fetches data from the SemanticScholar API.
        Args:
          search_query (str): The query to search for.
          limit (int, optional): The maximum number of results to return. Defaults to 100.
          year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
        Returns:
          list: A list of paper objects.
        Examples:
          >>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
          [{...}, {...}, ...]
        """
        params = {
            "query": search_query,
            "limit": limit,
            "fields": "title,url,abstract,authors,citationStyles,journal,citationCount,year,externalIds",
        }

        if year_range is not None:
            params["year"] = year_range

        data = self.make_request("", params=params)
        return data.get("data", [])

    def fetch_and_sort_papers(
        self,
        search_query,
        limit=100,
        top_n=20,
        year_range=None,
        keyword_combinations=None,
        weight_similarity=0.5,
    ):
        """
        Fetches and sorts papers from the SemanticScholar API.
        Args:
          search_query (str): The query to search for.
          limit (int, optional): The maximum number of results to return. Defaults to 100.
          top_n (int, optional): The maximum number of results to return after sorting. Defaults to 20.
          year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
          keyword_combinations (list, optional): A list of keyword combinations to search for. Defaults to None.
          weight_similarity (float, optional): The weight to give to the similarity score when sorting. Defaults to 0.5.
        Returns:
          list: A list of the top `top_n` paper objects sorted by combined score.
        Examples:
          >>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
          [{...}, {...}, ...]
        """
        papers = []
        if keyword_combinations is None:
            keyword_combinations = [search_query]

        for combination in keyword_combinations:
            papers.extend(self.fetch_data(combination, limit, year_range))

        max_citations = max(papers, key=lambda x: x["citationCount"])["citationCount"]

        for paper in papers:
            similarity = jellyfish.jaro_similarity(search_query, paper["title"])
            normalized_citation_count = paper["citationCount"] / max_citations
            paper["combined_score"] = (weight_similarity * similarity) + (
                (1 - weight_similarity) * normalized_citation_count
            )

        sorted_papers = sorted(papers, key=lambda x: x["combined_score"], reverse=True)

        # deduplicate paper entries prior to taking top n results
        sorted_dedup_papers = list(
            {each_paper["paperId"]: each_paper for each_paper in sorted_papers}.values()
        )

        return sorted_dedup_papers[:top_n]

__init__()

Initializes the SemanticScholarLoader class.

Returns:

    None

Notes

Calls the superclass constructor with the SemanticScholar API URL.

Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def __init__(self):
    """
    Initializes the SemanticScholarLoader class.
    Args:
      None
    Returns:
      None
    Notes:
      Calls the superclass constructor with the SemanticScholar API URL.
    """
    super().__init__("https://api.semanticscholar.org/graph/v1/paper/search")

fetch_and_sort_papers(search_query, limit=100, top_n=20, year_range=None, keyword_combinations=None, weight_similarity=0.5)

Fetches and sorts papers from the SemanticScholar API.

Parameters:

    search_query (str, required): The query to search for.
    limit (int, default 100): The maximum number of results to return.
    top_n (int, default 20): The maximum number of results to return after sorting.
    year_range (tuple, default None): A tuple of two integers representing the start and end year of the search.
    keyword_combinations (list, default None): A list of keyword combinations to search for.
    weight_similarity (float, default 0.5): The weight to give to the similarity score when sorting.

Returns:

    list: A list of the top `top_n` paper objects sorted by combined score.

Examples:

>>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
[{...}, {...}, ...]
Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def fetch_and_sort_papers(
    self,
    search_query,
    limit=100,
    top_n=20,
    year_range=None,
    keyword_combinations=None,
    weight_similarity=0.5,
):
    """
    Fetches and sorts papers from the SemanticScholar API.
    Args:
      search_query (str): The query to search for.
      limit (int, optional): The maximum number of results to return. Defaults to 100.
      top_n (int, optional): The maximum number of results to return after sorting. Defaults to 20.
      year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
      keyword_combinations (list, optional): A list of keyword combinations to search for. Defaults to None.
      weight_similarity (float, optional): The weight to give to the similarity score when sorting. Defaults to 0.5.
    Returns:
      list: A list of the top `top_n` paper objects sorted by combined score.
    Examples:
      >>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
      [{...}, {...}, ...]
    """
    papers = []
    if keyword_combinations is None:
        keyword_combinations = [search_query]

    for combination in keyword_combinations:
        papers.extend(self.fetch_data(combination, limit, year_range))

    max_citations = max(papers, key=lambda x: x["citationCount"])["citationCount"]

    for paper in papers:
        similarity = jellyfish.jaro_similarity(search_query, paper["title"])
        normalized_citation_count = paper["citationCount"] / max_citations
        paper["combined_score"] = (weight_similarity * similarity) + (
            (1 - weight_similarity) * normalized_citation_count
        )

    sorted_papers = sorted(papers, key=lambda x: x["combined_score"], reverse=True)

    # deduplicate paper entries prior to taking top n results
    sorted_dedup_papers = list(
        {each_paper["paperId"]: each_paper for each_paper in sorted_papers}.values()
    )

    return sorted_dedup_papers[:top_n]
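
To illustrate how `combined_score` is computed, the sketch below applies the same formula to one hypothetical paper (the numbers are made up; `jellyfish` is the library already used in the source above):

import jellyfish

search_query = "machine learning"
paper = {"title": "A Survey of Machine Learning", "citationCount": 250}
max_citations = 1000        # highest citation count among all fetched papers
weight_similarity = 0.5

# Jaro string similarity between the query and the paper title, in [0, 1]
similarity = jellyfish.jaro_similarity(search_query, paper["title"])
# Citation count normalized against the most-cited paper in the result set
normalized_citation_count = paper["citationCount"] / max_citations

combined_score = (weight_similarity * similarity) + (
    (1 - weight_similarity) * normalized_citation_count
)
print(round(combined_score, 3))

Papers are then sorted by this score in descending order, deduplicated by paperId, and truncated to the top `top_n` entries.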

fetch_data(search_query, limit=100, year_range=None)

Fetches data from the SemanticScholar API.

Parameters:

    search_query (str, required): The query to search for.
    limit (int, default 100): The maximum number of results to return.
    year_range (tuple, default None): A tuple of two integers representing the start and end year of the search.

Returns:

    list: A list of paper objects.

Examples:

>>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
[{...}, {...}, ...]
Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def fetch_data(self, search_query, limit=100, year_range=None):
    """
    Fetches data from the SemanticScholar API.
    Args:
      search_query (str): The query to search for.
      limit (int, optional): The maximum number of results to return. Defaults to 100.
      year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
    Returns:
      list: A list of paper objects.
    Examples:
      >>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
      [{...}, {...}, ...]
    """
    params = {
        "query": search_query,
        "limit": limit,
        "fields": "title,url,abstract,authors,citationStyles,journal,citationCount,year,externalIds",
    }

    if year_range is not None:
        params["year"] = year_range

    data = self.make_request("", params=params)
    return data.get("data", [])
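
A minimal usage sketch for SemanticScholarLoader (the import path is inferred from the file path shown above; the query, keyword combinations, and weight are illustrative only):

from autoresearcher.data_sources.web_apis.semantic_scholar_loader import SemanticScholarLoader

loader = SemanticScholarLoader()

# Raw search results from the Semantic Scholar Graph API
papers = loader.fetch_data("contrastive learning", limit=50)

# Top 10 papers ranked by a blend of title similarity and citation count
top_papers = loader.fetch_and_sort_papers(
    "contrastive learning",
    limit=50,
    top_n=10,
    keyword_combinations=["contrastive learning", "self-supervised representation learning"],
    weight_similarity=0.6,
)
for paper in top_papers:
    print(paper["title"], paper["citationCount"], round(paper["combined_score"], 3))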

BaseWebAPIDataLoader

Bases: ABC

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
class BaseWebAPIDataLoader(ABC):
    def __init__(self, base_url):
        self.base_url = base_url

    @abstractmethod
    def fetch_data(self, search_query, **kwargs):
        """
        Fetches data from the API.
        Args:
          search_query (str): The search query to use.
          **kwargs: Additional keyword arguments to pass to the API.
        Returns:
          dict: The response from the API.
        Raises:
          NotImplementedError: If the method is not implemented.
        """
        pass

    def make_request(self, endpoint, params=None):
        """
        Makes a request to the API.
        Args:
          endpoint (str): The API endpoint to make the request to.
          params (dict, optional): Additional parameters to pass to the API. Defaults to None.
        Returns:
          dict: The response from the API.
        Raises:
          Exception: If the request fails.
        """
        url = f"{self.base_url}{endpoint}"
        response = requests.get(url, params=params)

        if response.status_code == 200:
            data = response.json()
            return data
        else:
            raise Exception(f"Failed to fetch data from API: {response.status_code}")
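
Concrete loaders subclass BaseWebAPIDataLoader, pass their base URL to the constructor, and implement fetch_data on top of make_request. A minimal sketch of a hypothetical loader (the import path is inferred from the file path shown above; the API URL, endpoint, and field names are illustrative, not part of the library):

from autoresearcher.data_sources.web_apis.base_web_api_data_loader import BaseWebAPIDataLoader


class ExampleJSONLoader(BaseWebAPIDataLoader):
    def __init__(self):
        # Hypothetical JSON API, used only for illustration
        super().__init__("https://api.example.com/v1")

    def fetch_data(self, search_query, limit=10, **kwargs):
        # Delegate the HTTP call to the shared make_request helper
        params = {"q": search_query, "limit": limit}
        data = self.make_request("/search", params=params)
        return data.get("results", [])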

fetch_data(search_query, **kwargs) abstractmethod

Fetches data from the API.

Parameters:

    search_query (str, required): The search query to use.
    **kwargs: Additional keyword arguments to pass to the API.

Returns:

    dict: The response from the API.

Raises:

    NotImplementedError: If the method is not implemented.

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
@abstractmethod
def fetch_data(self, search_query, **kwargs):
    """
    Fetches data from the API.
    Args:
      search_query (str): The search query to use.
      **kwargs: Additional keyword arguments to pass to the API.
    Returns:
      dict: The response from the API.
    Raises:
      NotImplementedError: If the method is not implemented.
    """
    pass

make_request(endpoint, params=None)

Makes a request to the API.

Parameters:

    endpoint (str, required): The API endpoint to make the request to.
    params (dict, default None): Additional parameters to pass to the API.

Returns:

    dict: The response from the API.

Raises:

    Exception: If the request fails.

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
def make_request(self, endpoint, params=None):
    """
    Makes a request to the API.
    Args:
      endpoint (str): The API endpoint to make the request to.
      params (dict, optional): Additional parameters to pass to the API. Defaults to None.
    Returns:
      dict: The response from the API.
    Raises:
      Exception: If the request fails.
    """
    url = f"{self.base_url}{endpoint}"
    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = response.json()
        return data
    else:
        raise Exception(f"Failed to fetch data from API: {response.status_code}")
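
Because make_request raises a plain Exception for any non-200 response, callers that need to degrade gracefully can wrap the call; a small sketch (the loader, endpoint, and query parameters are illustrative):

loader = SemanticScholarLoader()
try:
    data = loader.make_request("", params={"query": "graph neural networks", "limit": 10})
except Exception as error:
    # Non-200 responses (and network errors) surface here
    print(f"Failed to fetch data: {error}")
    data = {}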